Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions examples/09-constant-memory.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
use cudarc::{
driver::{CudaContext, DriverError, LaunchConfig, PushKernelArg},
nvrtc::Ptx,
};

fn main() -> Result<(), DriverError> {
let ctx = CudaContext::new(0)?;
let stream = ctx.default_stream();

// Load the module containing the kernel with constant memory
let module = ctx.load_module(Ptx::from_file("./examples/constant_memory.ptx"))?;

// Get the constant memory symbol as a CudaSlice<u8>
let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
println!(
"Constant memory symbol 'coefficients' has {} bytes",
coefficients_symbol.len()
);

// Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
let coefficients = [1.0f32, 2.0, 3.0, 4.0];

// Transmute the symbol to f32 and copy coefficients to constant memory
let mut symbol_view = coefficients_symbol.as_view_mut();
let mut symbol_f32 = unsafe { symbol_view.transmute_mut::<f32>(4).unwrap() };
stream.memcpy_htod(&coefficients, &mut symbol_f32)?;

// Load the kernel function
let polynomial_kernel = module.load_function("polynomial_kernel")?;

// Prepare input data
let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
let n = input.len();

// Copy input to device
let input_dev = stream.memcpy_stod(&input)?;
let mut output_dev = stream.alloc_zeros::<f32>(n)?;

// Launch kernel
let cfg = LaunchConfig::for_num_elems(n as u32);
unsafe {
stream
.launch_builder(&polynomial_kernel)
.arg(&mut output_dev)
.arg(&input_dev)
.arg(&(n as i32))
.launch(cfg)
}?;

// Copy results back
let output = stream.memcpy_dtov(&output_dev)?;

// Verify results
println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
let expected = coefficients[0]
+ coefficients[1] * x
+ coefficients[2] * x * x
+ coefficients[3] * x * x * x;
println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
assert!(
(y - expected).abs() < 1e-4,
"Mismatch at index {}: got {}, expected {}",
i,
y,
expected
);
}

println!("\nAll results match expected values!");

Ok(())
}
19 changes: 19 additions & 0 deletions examples/constant_memory.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Constant memory - faster than global memory for read-only data
// accessed by all threads
__constant__ float coefficients[4];

extern "C" __global__ void polynomial_kernel(
float *out,
const float *inp,
int numel
) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < numel) {
float x = inp[i];
// Compute polynomial: coefficients[0] + coefficients[1]*x + coefficients[2]*x^2 + coefficients[3]*x^3
out[i] = coefficients[0] +
coefficients[1] * x +
coefficients[2] * x * x +
coefficients[3] * x * x * x;
}
}
60 changes: 60 additions & 0 deletions examples/constant_memory.ptx
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-36424714
// Cuda compilation tools, release 13.0, V13.0.88
// Based on NVVM 7.0.1
//

.version 9.0
.target sm_75
.address_size 64

// .globl polynomial_kernel
.const .align 4 .b8 coefficients[16];

.visible .entry polynomial_kernel(
.param .u64 polynomial_kernel_param_0,
.param .u64 polynomial_kernel_param_1,
.param .u32 polynomial_kernel_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<12>;
.reg .b32 %r<6>;
.reg .b64 %rd<8>;


ld.param.u64 %rd1, [polynomial_kernel_param_0];
ld.param.u64 %rd2, [polynomial_kernel_param_1];
ld.param.u32 %r2, [polynomial_kernel_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r3, %r4, %r5;
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB0_2;

cvta.to.global.u64 %rd3, %rd2;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.const.f32 %f1, [coefficients+4];
ld.global.f32 %f2, [%rd5];
ld.const.f32 %f3, [coefficients];
fma.rn.f32 %f4, %f2, %f1, %f3;
ld.const.f32 %f5, [coefficients+8];
mul.f32 %f6, %f2, %f5;
fma.rn.f32 %f7, %f2, %f6, %f4;
ld.const.f32 %f8, [coefficients+12];
mul.f32 %f9, %f2, %f8;
mul.f32 %f10, %f2, %f9;
fma.rn.f32 %f11, %f2, %f10, %f7;
cvta.to.global.u64 %rd6, %rd1;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f11;

$L__BB0_2:
ret;

}

18 changes: 18 additions & 0 deletions src/driver/result.rs
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,24 @@ pub mod module {
Ok(func.assume_init())
}

/// Returns a pointer to a global/constant symbol in the module.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1gf3e43972c23c2d5c8a662f2d9a4d0c24)
///
/// # Safety
/// `module` must be a properly allocated and not freed module.
pub unsafe fn get_global(
module: sys::CUmodule,
name: CString,
) -> Result<(sys::CUdeviceptr, usize), DriverError> {
let name_ptr = name.as_c_str().as_ptr();
let mut dptr = MaybeUninit::uninit();
let mut bytes = MaybeUninit::uninit();
sys::cuModuleGetGlobal_v2(dptr.as_mut_ptr(), bytes.as_mut_ptr(), module, name_ptr)
.result()?;
Ok((dptr.assume_init(), bytes.assume_init()))
}

/// Unloads a module.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b)
Expand Down
32 changes: 32 additions & 0 deletions src/driver/safe/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1750,6 +1750,38 @@ impl CudaModule {
module: self.clone(),
})
}

/// Gets a global/constant symbol from the loaded module as a [CudaSlice<u8>].
///
/// This can be used to access `__constant__` memory declared in CUDA kernels.
/// The returned slice can be transmuted to the appropriate type via views.
///
/// # Example
///
/// ```ignore
/// // In CUDA: __constant__ float my_const[4];
/// let symbol = module.get_global("my_const", &stream)?;
/// let mut symbol_view = symbol.as_view_mut();
/// let mut symbol_f32 = unsafe { symbol_view.transmute_mut::<f32>(4).unwrap() };
/// stream.memcpy_htod(&[1.0f32, 2.0, 3.0, 4.0], &mut symbol_f32)?;
/// ```
pub fn get_global(
self: &Arc<Self>,
name: &str,
stream: &Arc<CudaStream>,
) -> Result<CudaSlice<u8>, DriverError> {
let name_c =
CString::new(name).map_err(|_| DriverError(sys::CUresult::CUDA_ERROR_INVALID_VALUE))?;
let (cu_device_ptr, bytes) = unsafe { result::module::get_global(self.cu_module, name_c) }?;
Ok(CudaSlice {
cu_device_ptr,
len: bytes,
read: None,
write: None,
stream: stream.clone(),
marker: PhantomData,
})
}
}

impl CudaFunction {
Expand Down
Loading