Make the dynamic-smem callback of occupancy_max_potential_block_size optional and add the LaunchConfig::suggested wrapper

LateinCecer · LateinCecer · commit 09f3b33ce3b2 · 2026-06-17T17:07:50.000+02:00
diff --git a/examples/10-function-attributes.rs b/examples/10-function-attributes.rs
@@ -1,5 +1,5 @@
 use cudarc::{
-    driver::{CudaContext, DriverError},
+    driver::{CudaContext, DriverError, LaunchConfig, SharedMemoryConfig},
     nvrtc::Ptx,
 };
 
@@ -56,16 +56,21 @@ fn main() -> Result<(), DriverError> {
     println!();
 
     // Use occupancy API to get optimal launch configuration
-    extern "C" fn no_dynamic_smem(_block_size: std::ffi::c_int) -> usize {
-        0
-    }
     let (min_grid_size, block_size) =
-        sin_kernel.occupancy_max_potential_block_size(no_dynamic_smem, 0, 0, None)?;
+        sin_kernel.occupancy_max_potential_block_size(None, 0, 0, None)?;
 
     println!("=== Optimal Launch Configuration (sin_kernel) ===");
     println!("  Suggested block size:     {}", block_size);
     println!("  Min grid size:            {}", min_grid_size);
     println!("  Total threads per grid:   {}", min_grid_size * block_size);
 
+    // Or use the wrapper to get the suggested launch configuration for n elements
+    let n = 999_999;
+    let launch_config = LaunchConfig::suggested(n, &sin_kernel, None, SharedMemoryConfig::none())?;
+
+    println!(" === Optimal Launch Configuration for {n} elements (sin_kernel) ===");
+    println!("  Suggested block size:     {}", launch_config.block_dim.0);
+    println!("  grid size:                {}", launch_config.grid_dim.0);
+    println!("  Total threads per grid:   {}", launch_config.block_dim.0 * launch_config.grid_dim.0);
     Ok(())
 }
diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs
@@ -2331,9 +2331,44 @@ impl CudaFunction {
         Ok(num_clusters as u32)
     }
 
+    /// Suggest a launch configuration with reasonable occupancy.
+    ///
+    /// Returns a block size that can achieve the maximum occupancy (or, the maximum number of
+    /// active warps with the fewest blocks per multiprocessor), along with the minimum grid size
+    /// needed to achieve that maximum occupancy.
+    ///
+    /// If `block_size_limit` is 0, the maximum block size permitted by the device/function is used.
+    ///
+    /// ### Dynamic Shared Memory
+    ///
+    /// - If dynamic shared memory is **not** needed, pass `None` for `block_size_to_dynamic_smem_size`
+    ///   and `0` for `dynamic_smem_size`.
+    /// - If dynamic shared memory is **constant** regardless of block size, pass `None` for the
+    ///   callback and the constant size in `dynamic_smem_size`.
+    /// - If dynamic shared memory **varies** with block size, provide a callback via
+    ///   `block_size_to_dynamic_smem_size` that computes the required shared memory for any given
+    ///   block size. The `dynamic_smem_size` parameter is ignored in this case.
+    ///
+    /// ### Flags
+    ///
+    /// - `CU_OCCUPANCY_DEFAULT` — default behavior.
+    /// - `CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE` — guarantees that the returned launch configuration
+    ///   is global caching compatible, at a potential cost of occupancy.
+    ///
+    /// # Returns
+    ///
+    /// A tuple of `(min_grid_size, block_size)`.
+    ///
+    /// # Errors
+    ///
+    /// Errors if the driver call fails; may also report errors from previous, asynchronous launches.
+    ///
+    /// # See Also
+    ///
+    /// [`cuOccupancyMaxPotentialBlockSizeWithFlags`](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html)
     pub fn occupancy_max_potential_block_size(
         &self,
-        block_size_to_dynamic_smem_size: extern "C" fn(block_size: std::ffi::c_int) -> usize,
+        block_size_to_dynamic_smem_size: Option<unsafe extern "C" fn(block_size: std::ffi::c_int) -> usize>,
         dynamic_smem_size: usize,
         block_size_limit: u32,
         flags: Option<sys::CUoccupancy_flags_enum>,
@@ -2347,7 +2382,7 @@ impl CudaFunction {
                 &mut min_grid_size,
                 &mut block_size,
                 self.cu_function,
-                Some(block_size_to_dynamic_smem_size),
+                block_size_to_dynamic_smem_size,
                 dynamic_smem_size,
                 block_size_limit as std::ffi::c_int,
                 flags as std::ffi::c_uint,
@@ -2392,6 +2427,18 @@ impl CudaFunction {
         Ok(cluster_size as u32)
     }
 
+    /// Returns the underlying CUDA function object.
+    /// Use at your own risk.
+    ///
+    /// # Warning
+    ///
+    /// The returned handle is only valid as long as the [CudaModule] that loaded
+    /// this function remains loaded. Using the handle after the module is unloaded
+    /// may cause undefined behavior or a driver error.
+    pub fn cu_function(&self) -> sys::CUfunction {
+        self.cu_function
+    }
+
     /// Get the value of a specific attribute of this [CudaFunction].
     ///
     /// See [CUDA docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b)
diff --git a/src/driver/safe/launch.rs b/src/driver/safe/launch.rs
@@ -23,12 +23,50 @@ pub struct LaunchConfig {
     pub shared_mem_bytes: u32,
 }
 
+/// Shared memory configuration for the calculation of the kernel launch configuration.
+/// See [LaunchConfig::suggested] for info.
+///
+/// # Safety
+///
+/// The `Dynamic` variant holds an unsafe `extern "C"` function that computes the shared memory
+/// size in bytes for a given block size.
+/// This function is passed directly to the CUDA driver and is also called from Rust.
+/// Whoever constructs the variant must guarantee that the function is safe to call for any
+/// block size and that every returned size fits in a `u32`.
+#[derive(Clone, Copy, Debug)]
+pub enum SharedMemoryConfig {
+    Fixed(usize),
+    Dynamic(unsafe extern "C" fn (block_size: std::ffi::c_int) -> usize),
+}
+
+impl SharedMemoryConfig {
+    /// Configuration for kernels that need no dynamic shared memory; same as `Fixed(0)`.
+    pub fn none() -> Self {
+        Self::Fixed(0)
+    }
+
+    fn with_block_size(&self, block_size: u32) -> u32 {
+        match self {
+            Self::Fixed(val) => {
+                debug_assert!(*val <= u32::MAX as usize, "shared memory size exceeds u32::MAX");
+                *val as u32
+            }
+            Self::Dynamic(func) => unsafe {
+                let smem = func(block_size as std::ffi::c_int);
+                debug_assert!(smem <= u32::MAX as usize, "dynamic shared memory size exceeds u32::MAX");
+                smem as u32
+            },
+        }
+    }
+}
+
 impl LaunchConfig {
     /// Creates a [LaunchConfig] with:
     /// - block_dim == `1024`
     /// - grid_dim == `(n + 1023) / 1024`
     /// - shared_mem_bytes == `0`
     pub fn for_num_elems(n: u32) -> Self {
+        debug_assert!(n > 0, "n must be greater than 0");
         const NUM_THREADS: u32 = 1024;
         let num_blocks = n.div_ceil(NUM_THREADS);
         Self {
@@ -37,6 +75,67 @@ impl LaunchConfig {
             shared_mem_bytes: 0,
         }
     }
+
+    pub fn for_block_size(n: u32, block_size: u32, smem: SharedMemoryConfig) -> Self {
+        debug_assert!(n > 0, "n must be greater than 0");
+        debug_assert!(block_size > 0, "block size must be greater than 0");
+        let num_blocks = n.div_ceil(block_size);
+        Self {
+            grid_dim: (num_blocks, 1, 1),
+            block_dim: (block_size, 1, 1),
+            shared_mem_bytes: smem.with_block_size(block_size),
+        }
+    }
+
+    /// Calculates a launch configuration that _should_ yield a reasonable occupancy on the GPU.
+    ///
+    /// # Performance Considerations
+    ///
+    /// Note that the values returned by this function are based on calculations done by the
+    /// driver, given the CUDA function, the shared memory configuration, and the current
+    /// hardware.
+    /// In many cases the configuration provided by this will *not* be the absolute optimum, as
+    /// GPU performance can be very unpredictable, especially if scheduling of multiple concurrent
+    /// kernels becomes important.
+    /// Always benchmark your kernels if you want optimal performance!
+    /// This is more of a 'good enough for most cases' situation.
+    pub fn suggested(
+        n: u32,
+        func: &CudaFunction,
+        block_size_limit: Option<u32>,
+        smem: SharedMemoryConfig,
+    ) -> Result<Self, DriverError> {
+        debug_assert!(n > 0, "n must be greater than 0");
+        let (min_grid_size, block_size, shared_mem_bytes) = match smem {
+            SharedMemoryConfig::Fixed(smem_size) => {
+                let (g, b) = func.occupancy_max_potential_block_size(
+                    None,
+                    smem_size,
+                    block_size_limit.unwrap_or(0),
+                    None,
+                )?;
+                debug_assert!(smem_size <= u32::MAX as usize, "shared memory size exceeds u32::MAX");
+                (g, b, smem_size as u32)
+            }
+            SharedMemoryConfig::Dynamic(block_size_to_smem_size) => {
+                let (g, b) = func.occupancy_max_potential_block_size(
+                    Some(block_size_to_smem_size),
+                    0,
+                    block_size_limit.unwrap_or(0),
+                    None,
+                )?;
+                let smem = unsafe { block_size_to_smem_size(b as std::ffi::c_int) };
+                debug_assert!(smem <= u32::MAX as usize, "dynamic shared memory size exceeds u32::MAX");
+                (g, b, smem as u32)
+            }
+        };
+        let grid_size = u32::max(min_grid_size, n.div_ceil(block_size));
+        Ok(Self {
+            block_dim: (block_size, 1, 1),
+            grid_dim: (grid_size, 1, 1),
+            shared_mem_bytes,
+        })
+    }
 }
 
 /// The kernel launch builder. Instantiate with [CudaStream::launch_builder()], and then
diff --git a/src/driver/safe/mod.rs b/src/driver/safe/mod.rs
@@ -14,7 +14,7 @@ pub use self::core::{
 };
 pub use self::external_memory::{ExternalMemory, MappedBuffer};
 pub use self::graph::CudaGraph;
-pub use self::launch::{LaunchArgs, LaunchConfig, PushKernelArg};
+pub use self::launch::{LaunchArgs, LaunchConfig, SharedMemoryConfig, PushKernelArg};
 pub use self::profile::{profiler_start, profiler_stop, Profiler};
 pub use self::unified_memory::{UnifiedSlice, UnifiedView, UnifiedViewMut};
 pub use crate::driver::result::DriverError;