Skip to content

Commit 09f3b33

Browse files
committed
corrected call parameters for occupancy_max_potential_block_size and added rusty wrapper for LaunchConfig
1 parent 3e5d38b commit 09f3b33

4 files changed

Lines changed: 159 additions & 8 deletions

File tree

examples/10-function-attributes.rs

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use cudarc::{
2-
driver::{CudaContext, DriverError},
2+
driver::{CudaContext, DriverError, LaunchConfig, SharedMemoryConfig},
33
nvrtc::Ptx,
44
};
55

@@ -56,16 +56,21 @@ fn main() -> Result<(), DriverError> {
5656
println!();
5757

5858
// Use occupancy API to get optimal launch configuration
59-
extern "C" fn no_dynamic_smem(_block_size: std::ffi::c_int) -> usize {
60-
0
61-
}
6259
let (min_grid_size, block_size) =
63-
sin_kernel.occupancy_max_potential_block_size(no_dynamic_smem, 0, 0, None)?;
60+
sin_kernel.occupancy_max_potential_block_size(None, 0, 0, None)?;
6461

6562
println!("=== Optimal Launch Configuration (sin_kernel) ===");
6663
println!(" Suggested block size: {}", block_size);
6764
println!(" Min grid size: {}", min_grid_size);
6865
println!(" Total threads per grid: {}", min_grid_size * block_size);
6966

67+
// Or use the wrapper to get the suggested launch configuration for n elements
68+
let n = 999_999;
69+
let launch_config = LaunchConfig::suggested(n, &sin_kernel, None, SharedMemoryConfig::none())?;
70+
71+
println!(" === Optimal Launch Configuration for {n} elements (sin_kernel) ===");
72+
println!(" Suggested block size: {}", launch_config.block_dim.0);
73+
println!(" grid size: {}", launch_config.grid_dim.0);
74+
println!(" Total threads per grid: {}", launch_config.block_dim.0 * launch_config.grid_dim.0);
7075
Ok(())
7176
}

src/driver/safe/core.rs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2331,9 +2331,44 @@ impl CudaFunction {
23312331
Ok(num_clusters as u32)
23322332
}
23332333

2334+
/// Suggest a launch configuration with reasonable occupancy.
2335+
///
2336+
/// Returns a block size that can achieve the maximum occupancy (or, the maximum number of
2337+
/// active warps with the fewest blocks per multiprocessor), along with the minimum grid size
2338+
/// needed to achieve that maximum occupancy.
2339+
///
2340+
/// If `block_size_limit` is 0, the maximum block size permitted by the device/function is used.
2341+
///
2342+
/// ### Dynamic Shared Memory
2343+
///
2344+
/// - If dynamic shared memory is **not** needed, pass `None` for `block_size_to_dynamic_smem_size`
2345+
/// and `0` for `dynamic_smem_size`.
2346+
/// - If dynamic shared memory is **constant** regardless of block size, pass `None` for the
2347+
/// callback and the constant size in `dynamic_smem_size`.
2348+
/// - If dynamic shared memory **varies** with block size, provide a callback via
2349+
/// `block_size_to_dynamic_smem_size` that computes the required shared memory for any given
2350+
/// block size. The `dynamic_smem_size` parameter is ignored in this case.
2351+
///
2352+
/// ### Flags
2353+
///
2354+
/// - `CU_OCCUPANCY_DEFAULT` — default behavior.
2355+
/// - `CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE` — guarantees that the returned launch configuration
2356+
/// is global caching compatible, at a potential cost of occupancy.
2357+
///
2358+
/// # Returns
2359+
///
2360+
/// A tuple of `(min_grid_size, block_size)`.
2361+
///
2362+
/// # Errors
2363+
///
2364+
/// This function may also return error codes from previous, asynchronous launches.
2365+
///
2366+
/// # See Also
2367+
///
2368+
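/// [`cuOccupancyMaxPotentialBlockSizeWithFlags`](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html)
/// (driver API), and its runtime API counterpart
/// [`cudaOccupancyMaxPotentialBlockSizeWithFlags`](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__HIGHLEVEL.html#group__CUDART__HIGHLEVEL_1gd0524825c5c01bbc9a5e29e890745800)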
23342369
pub fn occupancy_max_potential_block_size(
23352370
&self,
2336-
block_size_to_dynamic_smem_size: extern "C" fn(block_size: std::ffi::c_int) -> usize,
2371+
block_size_to_dynamic_smem_size: Option<unsafe extern "C" fn(block_size: std::ffi::c_int) -> usize>,
23372372
dynamic_smem_size: usize,
23382373
block_size_limit: u32,
23392374
flags: Option<sys::CUoccupancy_flags_enum>,
@@ -2347,7 +2382,7 @@ impl CudaFunction {
23472382
&mut min_grid_size,
23482383
&mut block_size,
23492384
self.cu_function,
2350-
Some(block_size_to_dynamic_smem_size),
2385+
block_size_to_dynamic_smem_size,
23512386
dynamic_smem_size,
23522387
block_size_limit as std::ffi::c_int,
23532388
flags as std::ffi::c_uint,
@@ -2392,6 +2427,18 @@ impl CudaFunction {
23922427
Ok(cluster_size as u32)
23932428
}
23942429

2430+
/// Returns the underlying CUDA function object.
2431+
/// Use at your own risk.
2432+
///
2433+
/// # Warning
2434+
///
2435+
/// The returned handle is only valid as long as the [CudaModule] that loaded
2436+
/// this function remains loaded. Using the handle after the module is unloaded
2437+
/// may cause undefined behavior or a driver error.
2438+
pub fn cu_function(&self) -> sys::CUfunction {
2439+
self.cu_function
2440+
}
2441+
23952442
/// Get the value of a specific attribute of this [CudaFunction].
23962443
///
23972444
/// See [CUDA docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b)

src/driver/safe/launch.rs

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,50 @@ pub struct LaunchConfig {
2323
pub shared_mem_bytes: u32,
2424
}
2525

26+
/// Shared memory configuration for the calculation of the kernel launch configuration.
/// See [LaunchConfig::suggested] for info.
///
/// # Safety
///
/// The `Dynamic` variant contains an unsafe `extern "C"` function to calculate the shared memory
/// size in bytes for a given block size.
/// This function is passed directly to the CUDA driver, and is also called by this crate to
/// compute the final shared memory size of a launch configuration.
/// The caller must guarantee that this function returns valid smem values for all reasonable
/// block sizes.
#[derive(Clone, Copy, Debug)]
pub enum SharedMemoryConfig {
    /// A constant shared memory size in bytes, independent of the block size.
    Fixed(usize),
    /// A callback that returns the shared memory size in bytes for a given block size.
    Dynamic(unsafe extern "C" fn(block_size: std::ffi::c_int) -> usize),
}

impl SharedMemoryConfig {
    /// For functions with no shared memory.
    pub fn none() -> Self {
        Self::Fixed(0)
    }

    /// Resolves this configuration to a shared memory size in bytes for `block_size`.
    ///
    /// # Panics
    ///
    /// Panics if the resulting size does not fit into a `u32`. This is checked in all build
    /// profiles: silently truncating the value would produce a launch configuration with less
    /// shared memory than was requested.
    fn with_block_size(&self, block_size: u32) -> u32 {
        match *self {
            Self::Fixed(bytes) => {
                assert!(
                    bytes <= u32::MAX as usize,
                    "shared memory size exceeds u32::MAX"
                );
                bytes as u32
            }
            Self::Dynamic(func) => {
                // SAFETY: whoever constructed the `Dynamic` variant is responsible for the
                // callback being callable with any block size (see the type-level docs).
                let bytes = unsafe { func(block_size as std::ffi::c_int) };
                assert!(
                    bytes <= u32::MAX as usize,
                    "dynamic shared memory size exceeds u32::MAX"
                );
                bytes as u32
            }
        }
    }
}
62+
2663
impl LaunchConfig {
2764
/// Creates a [LaunchConfig] with:
2865
/// - block_dim == `1024`
2966
/// - grid_dim == `(n + 1023) / 1024`
3067
/// - shared_mem_bytes == `0`
3168
pub fn for_num_elems(n: u32) -> Self {
69+
debug_assert!(n > 0, "n must be greater than 0");
3270
const NUM_THREADS: u32 = 1024;
3371
let num_blocks = n.div_ceil(NUM_THREADS);
3472
Self {
@@ -37,6 +75,67 @@ impl LaunchConfig {
3775
shared_mem_bytes: 0,
3876
}
3977
}
78+
79+
pub fn for_block_size(n: u32, block_size: u32, smem: SharedMemoryConfig) -> Self {
80+
debug_assert!(n > 0, "n must be greater than 0");
81+
debug_assert!(block_size > 0, "block size must be greater than 0");
82+
let num_blocks = n.div_ceil(block_size);
83+
Self {
84+
grid_dim: (num_blocks, 1, 1),
85+
block_dim: (block_size, 1, 1),
86+
shared_mem_bytes: smem.with_block_size(block_size),
87+
}
88+
}
89+
90+
/// Calculates a launch configuration that _should_ yield a reasonable occupancy on the GPU.
91+
///
92+
/// # Performance Considerations
93+
///
94+
/// Note that the values returned by this function are based on calculations done by the
95+
/// driver, provided the loadout of the cuda function, the shared memory specifications, and
96+
/// current hardware.
97+
/// In many cases the configuration provided by this will *not* be the absolute optimum, as
98+
/// GPU performance can be very unpredictable, especially if scheduling of multiple concurrent
99+
/// kernels becomes important.
100+
/// Always benchmark your kernels if you want optimal performance!
101+
/// This is more of a 'good enough for most cases' situation.
102+
pub fn suggested(
103+
n: u32,
104+
func: &CudaFunction,
105+
block_size_limit: Option<u32>,
106+
smem: SharedMemoryConfig,
107+
) -> Result<Self, DriverError> {
108+
debug_assert!(n > 0, "n must be greater than 0");
109+
let (min_grid_size, block_size, shared_mem_bytes) = match smem {
110+
SharedMemoryConfig::Fixed(smem_size) => {
111+
let (g, b) = func.occupancy_max_potential_block_size(
112+
None,
113+
smem_size,
114+
block_size_limit.unwrap_or(0),
115+
None,
116+
)?;
117+
debug_assert!(smem_size <= u32::MAX as usize, "shared memory size exceeds u32::MAX");
118+
(g, b, smem_size as u32)
119+
}
120+
SharedMemoryConfig::Dynamic(block_size_to_smem_size) => {
121+
let (g, b) = func.occupancy_max_potential_block_size(
122+
Some(block_size_to_smem_size),
123+
0,
124+
block_size_limit.unwrap_or(0),
125+
None,
126+
)?;
127+
let smem = unsafe { block_size_to_smem_size(b as std::ffi::c_int) };
128+
debug_assert!(smem <= u32::MAX as usize, "dynamic shared memory size exceeds u32::MAX");
129+
(g, b, smem as u32)
130+
}
131+
};
132+
let grid_size = u32::max(min_grid_size, n.div_ceil(block_size));
133+
Ok(Self {
134+
block_dim: (block_size, 1, 1),
135+
grid_dim: (grid_size, 1, 1),
136+
shared_mem_bytes,
137+
})
138+
}
40139
}
41140

42141
/// The kernel launch builder. Instantiate with [CudaStream::launch_builder()], and then

src/driver/safe/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub use self::core::{
1414
};
1515
pub use self::external_memory::{ExternalMemory, MappedBuffer};
1616
pub use self::graph::CudaGraph;
17-
pub use self::launch::{LaunchArgs, LaunchConfig, PushKernelArg};
17+
pub use self::launch::{LaunchArgs, LaunchConfig, SharedMemoryConfig, PushKernelArg};
1818
pub use self::profile::{profiler_start, profiler_stop, Profiler};
1919
pub use self::unified_memory::{UnifiedSlice, UnifiedView, UnifiedViewMut};
2020
pub use crate::driver::result::DriverError;

0 commit comments

Comments
 (0)