From 29fab7ab0bfb7f6d0d8bf16681f937210060de7c Mon Sep 17 00:00:00 2001
From: David Maseda Neira <david.masedan@gmail.com>
Date: Sat, 1 Nov 2025 10:35:33 +0100
Subject: [PATCH 1/3] Add safe API for CUDA constant memory operations

---
 examples/09-constant-memory.rs |  74 ++++++++++++++++++++++++
 examples/constant_memory.cu    |  19 +++++++
 examples/constant_memory.ptx   |  60 ++++++++++++++++++++
 src/driver/result.rs           |  18 ++++++
 src/driver/safe/core.rs        | 100 +++++++++++++++++++++++++++++++++
 5 files changed, 271 insertions(+)
 create mode 100644 examples/09-constant-memory.rs
 create mode 100644 examples/constant_memory.cu
 create mode 100644 examples/constant_memory.ptx
diff --git a/examples/09-constant-memory.rs b/examples/09-constant-memory.rs
new file mode 100644
index 00000000..8b40af2e
--- /dev/null
+++ b/examples/09-constant-memory.rs
@@ -0,0 +1,74 @@
+use cudarc::{
+    driver::{CudaContext, DriverError, LaunchConfig, PushKernelArg},
+    nvrtc::Ptx,
+};
+
+fn main() -> Result<(), DriverError> {
+    let ctx = CudaContext::new(0)?;
+    let stream = ctx.default_stream();
+
+    // Load the module containing the kernel with constant memory
+    let module = ctx.load_module(Ptx::from_file("./examples/constant_memory.ptx"))?;
+
+    // Get the constant memory symbol
+    let coefficients_symbol = module.get_global("coefficients")?;
+    println!(
+        "Constant memory symbol 'coefficients' has {} bytes",
+        coefficients_symbol.num_bytes()
+    );
+
+    // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
+    let coefficients = [1.0f32, 2.0, 3.0, 4.0];
+
+    // Copy coefficients to constant memory
+    stream.memcpy_htos(&coefficients, &coefficients_symbol)?;
+
+    // Load the kernel function
+    let polynomial_kernel = module.load_function("polynomial_kernel")?;
+
+    // Prepare input data
+    let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
+    let n = input.len();
+
+    // Copy input to device
+    let input_dev = stream.memcpy_stod(&input)?;
+    let mut output_dev = stream.alloc_zeros::<f32>(n)?;
+
+    // Launch kernel
+    let cfg = LaunchConfig::for_num_elems(n as u32);
+    unsafe {
+        stream
+            .launch_builder(&polynomial_kernel)
+            .arg(&mut output_dev)
+            .arg(&input_dev)
+            .arg(&(n as i32))
+            .launch(cfg)
+    }?;
+
+    // Copy results back
+    let output = stream.memcpy_dtov(&output_dev)?;
+
+    // Verify results
+    println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
+    for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
+        let expected = coefficients[0]
+            + coefficients[1] * x
+            + coefficients[2] * x * x
+            + coefficients[3] * x * x * x;
+        println!(
+            "  f({:.1}) = {:.1} (expected {:.1})",
+            x, y, expected
+        );
+        assert!(
+            (y - expected).abs() < 1e-4,
+            "Mismatch at index {}: got {}, expected {}",
+            i,
+            y,
+            expected
+        );
+    }
+
+    println!("\nAll results match expected values!");
+
+    Ok(())
+}
diff --git a/examples/constant_memory.cu b/examples/constant_memory.cu
new file mode 100644
index 00000000..ed2d4f7d
--- /dev/null
+++ b/examples/constant_memory.cu
@@ -0,0 +1,19 @@
+// Constant memory - faster than global memory for read-only data
+// accessed by all threads
+__constant__ float coefficients[4];
+
+extern "C" __global__ void polynomial_kernel(
+    float *out,
+    const float *inp,
+    int numel
+) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < numel) {
+        float x = inp[i];
+        // Compute polynomial: coefficients[0] + coefficients[1]*x + coefficients[2]*x^2 + coefficients[3]*x^3
+        out[i] = coefficients[0] +
+                 coefficients[1] * x +
+                 coefficients[2] * x * x +
+                 coefficients[3] * x * x * x;
+    }
+}
diff --git a/examples/constant_memory.ptx b/examples/constant_memory.ptx
new file mode 100644
index 00000000..f13d2992
--- /dev/null
+++ b/examples/constant_memory.ptx
@@ -0,0 +1,60 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-36424714
+// Cuda compilation tools, release 13.0, V13.0.88
+// Based on NVVM 7.0.1
+//
+
+.version 9.0
+.target sm_75
+.address_size 64
+
+	// .globl	polynomial_kernel
+.const .align 4 .b8 coefficients[16];
+
+.visible .entry polynomial_kernel(
+	.param .u64 polynomial_kernel_param_0,
+	.param .u64 polynomial_kernel_param_1,
+	.param .u32 polynomial_kernel_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<12>;
+	.reg .b32 	%r<6>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [polynomial_kernel_param_0];
+	ld.param.u64 	%rd2, [polynomial_kernel_param_1];
+	ld.param.u32 	%r2, [polynomial_kernel_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r3, %r4, %r5;
+	setp.ge.s32 	%p1, %r1, %r2;
+	@%p1 bra 	$L__BB0_2;
+
+	cvta.to.global.u64 	%rd3, %rd2;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.const.f32 	%f1, [coefficients+4];
+	ld.global.f32 	%f2, [%rd5];
+	ld.const.f32 	%f3, [coefficients];
+	fma.rn.f32 	%f4, %f2, %f1, %f3;
+	ld.const.f32 	%f5, [coefficients+8];
+	mul.f32 	%f6, %f2, %f5;
+	fma.rn.f32 	%f7, %f2, %f6, %f4;
+	ld.const.f32 	%f8, [coefficients+12];
+	mul.f32 	%f9, %f2, %f8;
+	mul.f32 	%f10, %f2, %f9;
+	fma.rn.f32 	%f11, %f2, %f10, %f7;
+	cvta.to.global.u64 	%rd6, %rd1;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f32 	[%rd7], %f11;
+
+$L__BB0_2:
+	ret;
+
+}
+
diff --git a/src/driver/result.rs b/src/driver/result.rs
index 3111f4e0..499e6da3 100644
--- a/src/driver/result.rs
+++ b/src/driver/result.rs
@@ -965,6 +965,24 @@ pub mod module {
         Ok(func.assume_init())
     }
 
+    /// Returns a pointer to a global/constant symbol in the module.
+    ///
+    /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1gf3e43972c23c2d5c8a662f2d9a4d0c24)
+    ///
+    /// # Safety
+    /// `module` must be a properly allocated and not freed module.
+    pub unsafe fn get_global(
+        module: sys::CUmodule,
+        name: CString,
+    ) -> Result<(sys::CUdeviceptr, usize), DriverError> {
+        let name_ptr = name.as_c_str().as_ptr();
+        let mut dptr = MaybeUninit::uninit();
+        let mut bytes = MaybeUninit::uninit();
+        sys::cuModuleGetGlobal_v2(dptr.as_mut_ptr(), bytes.as_mut_ptr(), module, name_ptr)
+            .result()?;
+        Ok((dptr.assume_init(), bytes.assume_init()))
+    }
+
     /// Unloads a module.
     ///
     /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b)
diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs
index 97a55982..e3b7dcd8 100644
--- a/src/driver/safe/core.rs
+++ b/src/driver/safe/core.rs
@@ -1298,6 +1298,62 @@ impl CudaStream {
         unsafe { result::memcpy_dtod_async(dst, src, num_bytes, self.cu_stream) }
     }
 
+    /// Copy a `[T]`/`Vec<T>`/[`PinnedHostSlice<T>`] to a global/constant symbol.
+    ///
+    /// This is used to copy data into `__constant__` memory declared in CUDA kernels.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// // In CUDA: __constant__ float my_const[256];
+    /// let symbol = module.get_global("my_const")?;
+    /// let data = vec![1.0f32; 256];
+    /// stream.memcpy_htos(&data, &symbol)?;
+    /// ```
+    pub fn memcpy_htos<T: DeviceRepr, Src: HostSlice<T> + ?Sized>(
+        self: &Arc<Self>,
+        src: &Src,
+        symbol: &CudaSymbol,
+    ) -> Result<(), DriverError> {
+        let src_bytes = std::mem::size_of::<T>() * src.len();
+        assert!(
+            symbol.bytes >= src_bytes,
+            "Symbol size ({} bytes) is smaller than source data ({} bytes)",
+            symbol.bytes,
+            src_bytes
+        );
+        let (src, _record_src) = unsafe { src.stream_synced_slice(self) };
+        unsafe { result::memcpy_htod_async(symbol.cu_device_ptr, src, self.cu_stream) }
+    }
+
+    /// Copy a [`CudaSlice`]/[`CudaView`] to a global/constant symbol.
+    ///
+    /// This is used to copy data into `__constant__` memory declared in CUDA kernels.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// // In CUDA: __constant__ float my_const[256];
+    /// let symbol = module.get_global("my_const")?;
+    /// let device_data = stream.memcpy_stod(&vec![1.0f32; 256])?;
+    /// stream.memcpy_dtos(&device_data, &symbol)?;
+    /// ```
+    pub fn memcpy_dtos<T, Src: DevicePtr<T>>(
+        self: &Arc<Self>,
+        src: &Src,
+        symbol: &CudaSymbol,
+    ) -> Result<(), DriverError> {
+        let src_bytes = src.num_bytes();
+        assert!(
+            symbol.bytes >= src_bytes,
+            "Symbol size ({} bytes) is smaller than source data ({} bytes)",
+            symbol.bytes,
+            src_bytes
+        );
+        let (src, _record_src) = src.device_ptr(self);
+        unsafe { result::memcpy_dtod_async(symbol.cu_device_ptr, src, src_bytes, self.cu_stream) }
+    }
+
     /// Copy a [`CudaSlice`]/[`CudaView`] to a new [`CudaSlice`].
     pub fn clone_dtod<T: DeviceRepr, Src: DevicePtr<T>>(
         self: &Arc<Self>,
@@ -1750,6 +1806,50 @@ impl CudaModule {
             module: self.clone(),
         })
     }
+
+    /// Gets a global/constant symbol from the loaded module.
+    ///
+    /// This can be used to access `__constant__` memory declared in CUDA kernels.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// // In CUDA: __constant__ float my_const[256];
+    /// let symbol = module.get_global("my_const")?;
+    /// stream.memcpy_htos(&data, &symbol)?;
+    /// ```
+    pub fn get_global(self: &Arc<Self>, name: &str) -> Result<CudaSymbol, DriverError> {
+        let name_c = CString::new(name).unwrap();
+        let (cu_device_ptr, bytes) =
+            unsafe { result::module::get_global(self.cu_module, name_c) }?;
+        Ok(CudaSymbol {
+            cu_device_ptr,
+            bytes,
+            module: self.clone(),
+        })
+    }
+}
+
+/// Wrapper around a global/constant symbol from a CUDA module.
+///
+/// Created with [CudaModule::get_global()]. Use [CudaStream::memcpy_htos()]
+/// or [CudaStream::memcpy_dtos()] to copy data to the symbol.
+#[derive(Debug, Clone)]
+pub struct CudaSymbol {
+    pub(crate) cu_device_ptr: sys::CUdeviceptr,
+    pub(crate) bytes: usize,
+    #[allow(unused)]
+    pub(crate) module: Arc<CudaModule>,
+}
+
+unsafe impl Send for CudaSymbol {}
+unsafe impl Sync for CudaSymbol {}
+
+impl CudaSymbol {
+    /// Returns the size of the symbol in bytes.
+    pub fn num_bytes(&self) -> usize {
+        self.bytes
+    }
 }
 
 impl CudaFunction {

From 9a8045f7dfd8ec756b3769a847bf03dcc80ae5ee Mon Sep 17 00:00:00 2001
From: David Maseda Neira <david.masedan@gmail.com>
Date: Sat, 1 Nov 2025 10:41:21 +0100
Subject: [PATCH 2/3] Don't panic on invalid name

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 src/driver/safe/core.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs
index e3b7dcd8..00657450 100644
--- a/src/driver/safe/core.rs
+++ b/src/driver/safe/core.rs
@@ -1819,7 +1819,7 @@ impl CudaModule {
     /// stream.memcpy_htos(&data, &symbol)?;
     /// ```
     pub fn get_global(self: &Arc<Self>, name: &str) -> Result<CudaSymbol, DriverError> {
-        let name_c = CString::new(name).unwrap();
+        let name_c = CString::new(name).map_err(|_| DriverError(sys::cudaError_enum::CUDA_ERROR_INVALID_VALUE))?;
         let (cu_device_ptr, bytes) =
             unsafe { result::module::get_global(self.cu_module, name_c) }?;
         Ok(CudaSymbol {

From 0b7fb979dc8bf0967279bed24d01ea5fa42feb7a Mon Sep 17 00:00:00 2001
From: David Maseda Neira <david.masedan@gmail.com>
Date: Sat, 1 Nov 2025 16:58:58 +0100
Subject: [PATCH 3/3] Moved get_global to return CudaSlice<u8>

---
 examples/09-constant-memory.rs |  17 +++--
 src/driver/safe/core.rs        | 110 +++++++--------------------------
 2 files changed, 29 insertions(+), 98 deletions(-)

diff --git a/examples/09-constant-memory.rs b/examples/09-constant-memory.rs
index 8b40af2e..2a4d3e50 100644
--- a/examples/09-constant-memory.rs
+++ b/examples/09-constant-memory.rs
@@ -10,18 +10,20 @@ fn main() -> Result<(), DriverError> {
     // Load the module containing the kernel with constant memory
     let module = ctx.load_module(Ptx::from_file("./examples/constant_memory.ptx"))?;
 
-    // Get the constant memory symbol
-    let coefficients_symbol = module.get_global("coefficients")?;
+    // Get the constant memory symbol as a CudaSlice<u8>
+    let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
     println!(
         "Constant memory symbol 'coefficients' has {} bytes",
-        coefficients_symbol.num_bytes()
+        coefficients_symbol.len()
     );
 
     // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
     let coefficients = [1.0f32, 2.0, 3.0, 4.0];
 
-    // Copy coefficients to constant memory
-    stream.memcpy_htos(&coefficients, &coefficients_symbol)?;
+    // Transmute the symbol to f32 and copy coefficients to constant memory
+    let mut symbol_view = coefficients_symbol.as_view_mut();
+    let mut symbol_f32 = unsafe { symbol_view.transmute_mut::<f32>(4).unwrap() };
+    stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
 
     // Load the kernel function
     let polynomial_kernel = module.load_function("polynomial_kernel")?;
@@ -55,10 +57,7 @@ fn main() -> Result<(), DriverError> {
             + coefficients[1] * x
             + coefficients[2] * x * x
             + coefficients[3] * x * x * x;
-        println!(
-            "  f({:.1}) = {:.1} (expected {:.1})",
-            x, y, expected
-        );
+        println!("  f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
         assert!(
             (y - expected).abs() < 1e-4,
             "Mismatch at index {}: got {}, expected {}",
diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs
index 00657450..b854155b 100644
--- a/src/driver/safe/core.rs
+++ b/src/driver/safe/core.rs
@@ -1298,62 +1298,6 @@ impl CudaStream {
         unsafe { result::memcpy_dtod_async(dst, src, num_bytes, self.cu_stream) }
     }
 
-    /// Copy a `[T]`/`Vec<T>`/[`PinnedHostSlice<T>`] to a global/constant symbol.
-    ///
-    /// This is used to copy data into `__constant__` memory declared in CUDA kernels.
-    ///
-    /// # Example
-    ///
-    /// ```ignore
-    /// // In CUDA: __constant__ float my_const[256];
-    /// let symbol = module.get_global("my_const")?;
-    /// let data = vec![1.0f32; 256];
-    /// stream.memcpy_htos(&data, &symbol)?;
-    /// ```
-    pub fn memcpy_htos<T: DeviceRepr, Src: HostSlice<T> + ?Sized>(
-        self: &Arc<Self>,
-        src: &Src,
-        symbol: &CudaSymbol,
-    ) -> Result<(), DriverError> {
-        let src_bytes = std::mem::size_of::<T>() * src.len();
-        assert!(
-            symbol.bytes >= src_bytes,
-            "Symbol size ({} bytes) is smaller than source data ({} bytes)",
-            symbol.bytes,
-            src_bytes
-        );
-        let (src, _record_src) = unsafe { src.stream_synced_slice(self) };
-        unsafe { result::memcpy_htod_async(symbol.cu_device_ptr, src, self.cu_stream) }
-    }
-
-    /// Copy a [`CudaSlice`]/[`CudaView`] to a global/constant symbol.
-    ///
-    /// This is used to copy data into `__constant__` memory declared in CUDA kernels.
-    ///
-    /// # Example
-    ///
-    /// ```ignore
-    /// // In CUDA: __constant__ float my_const[256];
-    /// let symbol = module.get_global("my_const")?;
-    /// let device_data = stream.memcpy_stod(&vec![1.0f32; 256])?;
-    /// stream.memcpy_dtos(&device_data, &symbol)?;
-    /// ```
-    pub fn memcpy_dtos<T, Src: DevicePtr<T>>(
-        self: &Arc<Self>,
-        src: &Src,
-        symbol: &CudaSymbol,
-    ) -> Result<(), DriverError> {
-        let src_bytes = src.num_bytes();
-        assert!(
-            symbol.bytes >= src_bytes,
-            "Symbol size ({} bytes) is smaller than source data ({} bytes)",
-            symbol.bytes,
-            src_bytes
-        );
-        let (src, _record_src) = src.device_ptr(self);
-        unsafe { result::memcpy_dtod_async(symbol.cu_device_ptr, src, src_bytes, self.cu_stream) }
-    }
-
     /// Copy a [`CudaSlice`]/[`CudaView`] to a new [`CudaSlice`].
     pub fn clone_dtod<T: DeviceRepr, Src: DevicePtr<T>>(
         self: &Arc<Self>,
@@ -1807,51 +1751,39 @@ impl CudaModule {
         })
     }
 
-    /// Gets a global/constant symbol from the loaded module.
+    /// Gets a global/constant symbol from the loaded module as a [CudaSlice<u8>].
     ///
     /// This can be used to access `__constant__` memory declared in CUDA kernels.
+    /// The returned slice can be transmuted to the appropriate type via views.
     ///
     /// # Example
     ///
     /// ```ignore
-    /// // In CUDA: __constant__ float my_const[256];
-    /// let symbol = module.get_global("my_const")?;
-    /// stream.memcpy_htos(&data, &symbol)?;
+    /// // In CUDA: __constant__ float my_const[4];
+    /// let symbol = module.get_global("my_const", &stream)?;
+    /// let mut symbol_view = symbol.as_view_mut();
+    /// let mut symbol_f32 = unsafe { symbol_view.transmute_mut::<f32>(4).unwrap() };
+    /// stream.memcpy_htod(&[1.0f32, 2.0, 3.0, 4.0], &mut symbol_f32)?;
     /// ```
-    pub fn get_global(self: &Arc<Self>, name: &str) -> Result<CudaSymbol, DriverError> {
-        let name_c = CString::new(name).map_err(|_| DriverError(sys::cudaError_enum::CUDA_ERROR_INVALID_VALUE))?;
-        let (cu_device_ptr, bytes) =
-            unsafe { result::module::get_global(self.cu_module, name_c) }?;
-        Ok(CudaSymbol {
+    pub fn get_global(
+        self: &Arc<Self>,
+        name: &str,
+        stream: &Arc<CudaStream>,
+    ) -> Result<CudaSlice<u8>, DriverError> {
+        let name_c =
+            CString::new(name).map_err(|_| DriverError(sys::CUresult::CUDA_ERROR_INVALID_VALUE))?;
+        let (cu_device_ptr, bytes) = unsafe { result::module::get_global(self.cu_module, name_c) }?;
+        Ok(CudaSlice {
             cu_device_ptr,
-            bytes,
-            module: self.clone(),
+            len: bytes,
+            read: None,
+            write: None,
+            stream: stream.clone(),
+            marker: PhantomData,
         })
     }
 }
 
-/// Wrapper around a global/constant symbol from a CUDA module.
-///
-/// Created with [CudaModule::get_global()]. Use [CudaStream::memcpy_htos()]
-/// or [CudaStream::memcpy_dtos()] to copy data to the symbol.
-#[derive(Debug, Clone)]
-pub struct CudaSymbol {
-    pub(crate) cu_device_ptr: sys::CUdeviceptr,
-    pub(crate) bytes: usize,
-    #[allow(unused)]
-    pub(crate) module: Arc<CudaModule>,
-}
-
-unsafe impl Send for CudaSymbol {}
-unsafe impl Sync for CudaSymbol {}
-
-impl CudaSymbol {
-    /// Returns the size of the symbol in bytes.
-    pub fn num_bytes(&self) -> usize {
-        self.bytes
-    }
-}
-
 impl CudaFunction {
     pub fn occupancy_available_dynamic_smem_per_block(
         &self,