Skip to content

Commit cb80dbd

Browse files
cognect and claude
committed
feat(cuda+py): Phase 1.5++ — NVRTC spawn-path + compile/launch
atomr-accel-cuda: ContextActor now spawns NvrtcActor when feature `nvrtc` is on AND EnabledLibraries::NVRTC is set. KernelChildren gains a typed `nvrtc: Option<ActorRef<NvrtcMsg>>` field (mirrors the cuSOLVER spawn-path from 3e40c03). atomr-accel-py: Device.compile_kernel(name, src, ...) returns an NvrtcKernel; NvrtcKernel.launch(grid, block, args, shared) dispatches typed KernelArg payloads. KernelArg is a one-shot Python wrapper with constructors for scalar f32/f64/i32/i64/u32/u64 and device-pointer buffers across every supported GpuBuffer* dtype (f32/f64/i32/u32/u8). The NVRTC message variants are NvrtcMsg::Compile { src, kernel_name, opts, reply } and NvrtcMsg::Launch { kernel, args, cfg, reply } — matched against the actor source. NvrtcOpts is left at default for now; Phase-5 builder surface (LTO, --std=c++17, SmArch, name expressions) is a follow-up. Refs: #1 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 23a99b5 commit cb80dbd

7 files changed

Lines changed: 562 additions & 14 deletions

File tree

crates/atomr-accel-cuda/src/device/context_actor.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,20 @@ impl ContextActor {
215215
children.solver = Some(solver_stub);
216216
}
217217
}
218+
#[cfg(feature = "nvrtc")]
219+
{
220+
if self.config.enabled_libraries.contains(EnabledLibraries::NVRTC) {
221+
let nvrtc_stub = ctx
222+
.spawn::<crate::kernel::NvrtcActor>(
223+
crate::kernel::NvrtcActor::mock_props(),
224+
"nvrtc",
225+
)
226+
.unwrap_or_else(|e| {
227+
panic!("Unrecoverable: spawn mock NvrtcActor: {e}")
228+
});
229+
children.nvrtc = Some(nvrtc_stub);
230+
}
231+
}
218232
self.children = Some(children.clone());
219233
self.parent.tell(DeviceMsg::ContextReady { children });
220234
info!(device_id, "ContextActor (mock) ready");
@@ -350,6 +364,24 @@ impl ContextActor {
350364
None
351365
};
352366

367+
#[cfg(feature = "nvrtc")]
368+
let nvrtc_ref = if libs.contains(EnabledLibraries::NVRTC) {
369+
let s = allocator.acquire(Default::default());
370+
let props = crate::kernel::NvrtcActor::props(
371+
s,
372+
allocator.clone(),
373+
self.completion.clone(),
374+
self.state.clone(),
375+
cuda_ctx.clone(),
376+
);
377+
Some(
378+
ctx.spawn::<crate::kernel::NvrtcActor>(props, "nvrtc")
379+
.unwrap_or_else(|e| panic!("Unrecoverable: spawn NvrtcActor: {e}")),
380+
)
381+
} else {
382+
None
383+
};
384+
353385
#[allow(unused_mut)]
354386
let mut children = KernelChildren::new(blas_ref);
355387
#[cfg(feature = "cudnn")]
@@ -368,6 +400,10 @@ impl ContextActor {
368400
{
369401
children.solver = solver_ref;
370402
}
403+
#[cfg(feature = "nvrtc")]
404+
{
405+
children.nvrtc = nvrtc_ref;
406+
}
371407
self.children = Some(children.clone());
372408
self.parent.tell(DeviceMsg::ContextReady { children });
373409
info!(

crates/atomr-accel-cuda/src/device/device_actor.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,8 @@ pub struct KernelChildren {
337337
pub rng: Option<ActorRef<crate::kernel::RngMsg>>,
338338
#[cfg(feature = "cusolver")]
339339
pub solver: Option<ActorRef<crate::kernel::SolverMsg>>,
340+
#[cfg(feature = "nvrtc")]
341+
pub nvrtc: Option<ActorRef<crate::kernel::NvrtcMsg>>,
340342
/// TypeId-keyed registry for child actors not represented by a
341343
/// typed field above. The `Arc<RwLock<…>>` keeps `KernelChildren`
342344
/// `Clone` while letting later library crates register / look up
@@ -359,6 +361,8 @@ impl KernelChildren {
359361
rng: None,
360362
#[cfg(feature = "cusolver")]
361363
solver: None,
364+
#[cfg(feature = "nvrtc")]
365+
nvrtc: None,
362366
extras: Arc::new(RwLock::new(HashMap::new())),
363367
}
364368
}

crates/atomr-accel-py/python/atomr_accel/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def _try_import(name):
9797
Solver = _try_import("Solver")
9898
Collective = _try_import("Collective")
9999
NvrtcKernel = _try_import("NvrtcKernel")
100+
KernelArg = _try_import("KernelArg")
100101

101102
# Phase 1.5 — IPC handles (cfg cuda-ipc)
102103
IpcMemHandle = _try_import("IpcMemHandle")
@@ -150,6 +151,7 @@ def _try_import(name):
150151
"Solver",
151152
"Collective",
152153
"NvrtcKernel",
154+
"KernelArg",
153155
# Phase 2 — patterns
154156
"DynamicBatchingServer",
155157
"InferenceCascade",
Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
1-
"""``atomr_accel.nvrtc`` — NvrtcKernel handle.
1+
"""``atomr_accel.nvrtc`` — NVRTC compile + launch surface.
22
3-
Phase 1 keeps the existing ``NvrtcKernel`` stub (``name``, ``generation``).
4-
The device-side ``compile_kernel`` and ``launch`` paths require
5-
``SnapshotChildren`` plumbing plus typed ``KernelArg`` marshalling; both
6-
follow in the Phase 1.5 NVRTC tracking issue.
3+
Phase 1.5++ wires the full path:
74
8-
On builds without NVRTC, ``NvrtcKernel`` is ``None``.
5+
* ``Device.compile_kernel(name, src, ...)`` returns an
6+
:class:`NvrtcKernel`.
7+
* ``NvrtcKernel.launch(grid, block, args, ...)`` dispatches
8+
typed :class:`KernelArg` payloads (scalar f32/f64/i32/i64/u32/u64
9+
plus device-pointer wrappers around every supported ``GpuBuffer*``).
10+
11+
On builds without NVRTC, ``NvrtcKernel`` and ``KernelArg`` are ``None``.
912
"""
1013

1114
try:
12-
from ._native import NvrtcKernel
15+
from ._native import NvrtcKernel, KernelArg
1316
except ImportError:
1417
NvrtcKernel = None # type: ignore[assignment]
18+
KernelArg = None # type: ignore[assignment]
1519

16-
__all__ = ["NvrtcKernel"]
20+
__all__ = ["NvrtcKernel", "KernelArg"]

crates/atomr-accel-py/src/device.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,14 @@ impl PyDevice {
389389
{
390390
dict.set_item("cusolver", false)?;
391391
}
392+
#[cfg(feature = "nvrtc")]
393+
{
394+
dict.set_item("nvrtc", children.nvrtc.is_some())?;
395+
}
396+
#[cfg(not(feature = "nvrtc"))]
397+
{
398+
dict.set_item("nvrtc", false)?;
399+
}
392400
dict.set_item("extras", children.extras_len())?;
393401
}
394402
None => {
@@ -397,6 +405,7 @@ impl PyDevice {
397405
dict.set_item("cufft", false)?;
398406
dict.set_item("curand", false)?;
399407
dict.set_item("cusolver", false)?;
408+
dict.set_item("nvrtc", false)?;
400409
dict.set_item("extras", 0usize)?;
401410
dict.set_item("ready", false)?;
402411
}
@@ -488,6 +497,31 @@ impl PyDevice {
488497
Py::new(py, crate::solver::PySolver::new(h))
489498
}
490499

500+
/// JIT-compile a CUDA C++ kernel via NVRTC. Returns an
501+
/// [`crate::nvrtc::PyNvrtcKernel`] which can be launched via
502+
/// `kernel.launch(grid, block, args, ...)`. Requires the `nvrtc`
503+
/// cargo feature at build time *and* `EnabledLibraries::NVRTC` on
504+
/// this device. In mock mode the actor is spawned but every
505+
/// compile returns `Unrecoverable("NvrtcActor in mock mode")`.
506+
#[cfg(feature = "nvrtc")]
507+
#[pyo3(signature = (name, src, timeout_secs=60.0))]
508+
fn compile_kernel(
509+
&self,
510+
py: Python<'_>,
511+
name: String,
512+
src: String,
513+
timeout_secs: f64,
514+
) -> PyResult<Py<crate::nvrtc::PyNvrtcKernel>> {
515+
let kc = self
516+
.snapshot_children(py, timeout_secs)?
517+
.ok_or_else(|| errors::map_str("device children not ready"))?;
518+
let actor = kc
519+
.nvrtc
520+
.clone()
521+
.ok_or_else(|| errors::map_str("NVRTC actor not enabled on this device"))?;
522+
crate::nvrtc::compile_via_actor(py, actor, name, src, timeout_secs)
523+
}
524+
491525
fn __repr__(&self) -> String {
492526
format!("Device(id={})", self.device_id)
493527
}

0 commit comments

Comments
 (0)