Surface CUDA launch errors

shi-eric · shi-eric · commit b2ec684041c1 · 2026-06-08T05:32:35.000Z
CUDA launch failures were previously written to native stderr but
ignored by the Python launch callers. That let simulations and tape
backward replay continue with stale outputs or gradients after CUDA
rejected a launch.

Check the existing wp_cuda_launch_kernel return value in direct
launches, recorded Launch replay, JAX FFI, and APIC loaded-graph replay.
This keeps the hot path to a single branch and avoids adding
synchronization or extra CUDA queries.

Add CUDA regressions for oversized block dimensions, launch_bounds
violations, recorded commands, adjoint launches, and Tape.backward, and
record the user-facing fix in the changelog.

Signed-off-by: Eric Shi <ershi@nvidia.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -54,6 +54,8 @@
   instead of an internal `AttributeError` ([GH-1487](https://github.com/NVIDIA/warp/issues/1487)).
 - Fix stale gradient keepalive references when replacing a `@wp.struct` plain-array field with `None` or a
   non-gradient array ([GH-1520](https://github.com/NVIDIA/warp/issues/1520)).
+- Fix CUDA kernel launch failures to raise a Python `RuntimeError` instead of only logging to native CUDA stderr and
+  continuing with stale outputs or gradients ([GH-1535](https://github.com/NVIDIA/warp/issues/1535)).
 
 ### Documentation
 
diff --git a/warp/_src/context.py b/warp/_src/context.py
@@ -8580,6 +8580,10 @@ def invoke(kernel, hooks, params: Sequence[Any], adjoint: bool):
         hooks.backward(ctypes.byref(params[0]), ctypes.byref(args), ctypes.byref(adj_args))
 
 
+def _raise_cuda_launch_error(kernel: Kernel, device: Device) -> None:
+    raise RuntimeError(f"Error launching kernel: {kernel.key} on device {device}: {runtime.get_error_string()}")
+
+
 class Launch:
     """Represent all data required for a kernel launch so that launches can be replayed quickly.
 
@@ -8786,7 +8790,7 @@ def launch(self, stream: Stream | None = None) -> None:
                     graph._retain_module_exec(self.module_exec)
 
             if self.adjoint:
-                runtime.core.wp_cuda_launch_kernel(
+                if runtime.core.wp_cuda_launch_kernel(
                     self.device.context,
                     self.hooks.backward,
                     self.bounds.size,
@@ -8796,9 +8800,10 @@ def launch(self, stream: Stream | None = None) -> None:
                     self.params_addr,
                     stream.cuda_stream,
                     None,  # apic_info: replayed launches don't re-record
-                )
+                ):
+                    _raise_cuda_launch_error(self.kernel, self.device)
             else:
-                runtime.core.wp_cuda_launch_kernel(
+                if runtime.core.wp_cuda_launch_kernel(
                     self.device.context,
                     self.hooks.forward,
                     self.bounds.size,
@@ -8808,7 +8813,8 @@ def launch(self, stream: Stream | None = None) -> None:
                     self.params_addr,
                     stream.cuda_stream,
                     None,  # apic_info: replayed launches don't re-record
-                )
+                ):
+                    _raise_cuda_launch_error(self.kernel, self.device)
 
 
 def _canonicalize_dim(dim: int | Sequence[int]) -> tuple[int, ...]:
@@ -9127,7 +9133,7 @@ def pack_args(args, params, adjoint=False):
                             "Backward kernel launches are not supported during APIC graph capture. "
                             "Use wp.Tape outside of capture scope instead."
                         )
-                    runtime.core.wp_cuda_launch_kernel(
+                    if runtime.core.wp_cuda_launch_kernel(
                         device.context,
                         hooks.backward,
                         bounds.size,
@@ -9137,7 +9143,8 @@ def pack_args(args, params, adjoint=False):
                         kernel_params,
                         stream.cuda_stream,
                         None,
-                    )
+                    ):
+                        _raise_cuda_launch_error(kernel, device)
 
             else:
                 if hooks.forward is None:
@@ -9170,7 +9177,7 @@ def pack_args(args, params, adjoint=False):
                             False,
                         )
                         apic_info_ptr = ctypes.byref(apic_info)
-                    runtime.core.wp_cuda_launch_kernel(
+                    if runtime.core.wp_cuda_launch_kernel(
                         device.context,
                         hooks.forward,
                         bounds.size,
@@ -9180,7 +9187,8 @@ def pack_args(args, params, adjoint=False):
                         kernel_params,
                         stream.cuda_stream,
                         apic_info_ptr,
-                    )
+                    ):
+                        _raise_cuda_launch_error(kernel, device)
 
             try:
                 runtime.verify_cuda_device(device)
diff --git a/warp/_src/jax/ffi.py b/warp/_src/jax/ffi.py
@@ -451,7 +451,7 @@ def ffi_callback(self, call_frame):
                 assert hooks.forward, "Failed to find kernel entry point"
 
                 # launch the kernel
-                wp._src.context.runtime.core.wp_cuda_launch_kernel(
+                if wp._src.context.runtime.core.wp_cuda_launch_kernel(
                     device.context,
                     hooks.forward,
                     launch_bounds.size,
@@ -461,7 +461,11 @@ def ffi_callback(self, call_frame):
                     kernel_params,
                     stream,
                     None,  # apic_info
-                )
+                ):
+                    raise RuntimeError(
+                        f"Error launching kernel: {self.kernel.key} on device {device}: "
+                        f"{wp._src.context.runtime.get_error_string()}"
+                    )
 
         except Exception as e:
             print(traceback.format_exc())
diff --git a/warp/native/apic.cu b/warp/native/apic.cu
@@ -372,13 +372,17 @@ static bool apic_rebuild_cuda_graph(APICGraph* graph, CUstream stream)
             // Replay via the same wp_cuda_launch_kernel that captured this op.
             // apic_info=nullptr is safe: g_apic_state is null during replay, so
             // the recording branch in wp_cuda_launch_kernel is a no-op.
-            wp_cuda_launch_kernel(
+            size_t launch_result = wp_cuda_launch_kernel(
                 graph->cuda_context, kernel, rec->dim, rec->max_blocks, rec->block_dim, rec->smem_bytes, args.data(),
                 stream, /*apic_info=*/nullptr
             );
 
             for (uint8_t* p : arg_storage)
                 delete[] p;
+            if (launch_result) {
+                success = false;
+                break;
+            }
             break;
         }
 
diff --git a/warp/tests/test_launch.py b/warp/tests/test_launch.py
@@ -48,6 +48,11 @@ def square_kernel(input: wp.array(dtype=float), output: wp.array(dtype=float)):
     output[i] = input[i] * input[i]
 
 
+@wp.kernel
+def noop_kernel():
+    tid = wp.tid()
+
+
 def test1d(test, device):
     a = np.arange(0, dim_x).reshape(dim_x)
 
@@ -400,6 +405,12 @@ def kernel_single_tuple_bound(x: wp.array(dtype=float)):
     x[tid] = x[tid] * 2.0
 
 
+@wp.kernel(launch_bounds=256)
+def bounded_square_kernel(input: wp.array(dtype=float), output: wp.array(dtype=float)):
+    i = wp.tid()
+    output[i] = input[i] * input[i]
+
+
 def test_launch_bounds_none(test, device):
     """Test kernel without launch_bounds"""
     n = 1024
@@ -436,7 +447,61 @@ def test_launch_bounds_single_tuple(test, device):
     assert_np_equal(x.numpy(), np.full(n, 2.0, dtype=np.float32))
 
 
+def test_launch_device_block_dim_failure(test, device):
+    """Raise when CUDA rejects an oversized launch block.
+
+    Protects users from continuing after native stderr with kernel outputs left unchanged.
+    """
+    with assert_cuda_launch_error(test, r"Error launching kernel: .*noop_kernel.*Warp CUDA error"):
+        wp.launch(noop_kernel, dim=1, block_dim=2048, device=device)
+
+
+def test_launch_bounds_block_dim_failure(test, device):
+    """Raise when CUDA rejects a launch-bounds violation.
+
+    Protects users from silently skipping kernels whose outputs feed later simulation stages.
+    """
+    x = wp.ones(1, dtype=float, device=device)
+
+    with assert_cuda_launch_error(test, r"Error launching kernel: .*kernel_single_bound.*Warp CUDA error"):
+        wp.launch(kernel_single_bound, dim=1, inputs=[x], block_dim=512, device=device)
+
+
+def test_launch_cmd_block_dim_failure(test, device):
+    """Raise when recorded launches hit CUDA launch errors.
+
+    Protects recorded command replay from returning normally with stale outputs.
+    """
+    x = wp.ones(1, dtype=float, device=device)
+    cmd = wp.launch(kernel_single_bound, dim=1, inputs=[x], block_dim=512, device=device, record_cmd=True)
+
+    with assert_cuda_launch_error(test, r"Error launching kernel: .*kernel_single_bound.*Warp CUDA error"):
+        cmd.launch()
+
+
+def test_launch_adjoint_block_dim_failure(test, device):
+    """Raise when adjoint launches hit CUDA launch errors.
+
+    Protects differentiable simulations from using missing or partial gradients.
+    """
+    input_arr = wp.array([1.0], dtype=float, requires_grad=True, device=device)
+    output_arr = wp.empty_like(input_arr)
+    output_arr.grad.fill_(1.0)
+
+    with assert_cuda_launch_error(test, r"Error launching kernel: .*bounded_square_kernel.*Warp CUDA error"):
+        wp.launch(
+            bounded_square_kernel,
+            dim=input_arr.size,
+            inputs=[input_arr, output_arr],
+            adj_inputs=[None, None],
+            adjoint=True,
+            block_dim=512,
+            device=device,
+        )
+
+
 devices = get_test_devices()
+cuda_devices = get_cuda_test_devices()
 
 
 class TestLaunch(unittest.TestCase):
@@ -462,6 +527,18 @@ class TestLaunch(unittest.TestCase):
 add_function_test(TestLaunch, "test_launch_bounds_single", test_launch_bounds_single, devices=devices)
 add_function_test(TestLaunch, "test_launch_bounds_tuple", test_launch_bounds_tuple, devices=devices)
 add_function_test(TestLaunch, "test_launch_bounds_single_tuple", test_launch_bounds_single_tuple, devices=devices)
+add_function_test(
+    TestLaunch, "test_launch_device_block_dim_failure", test_launch_device_block_dim_failure, devices=cuda_devices
+)
+add_function_test(
+    TestLaunch, "test_launch_bounds_block_dim_failure", test_launch_bounds_block_dim_failure, devices=cuda_devices
+)
+add_function_test(
+    TestLaunch, "test_launch_cmd_block_dim_failure", test_launch_cmd_block_dim_failure, devices=cuda_devices
+)
+add_function_test(
+    TestLaunch, "test_launch_adjoint_block_dim_failure", test_launch_adjoint_block_dim_failure, devices=cuda_devices
+)
 
 
 if __name__ == "__main__":
diff --git a/warp/tests/test_tape.py b/warp/tests/test_tape.py
@@ -303,7 +303,27 @@ def test_tape_visualize_subscript(test, device):
     test.assertIn("array: dtype=", dot_code)
 
 
+def test_tape_backward_cuda_launch_failure(test, device):
+    """Raise when Tape backward hits CUDA launch errors.
+
+    Corrupt the recorded backward launch block size to reproduce the stale-gradient failure mode.
+    Protects ``Tape.backward()`` from returning after a failed replay with stale or missing gradients.
+    """
+    x = wp.array([1.0], dtype=wp.float32, device=device, requires_grad=True)
+    y = wp.empty_like(x, requires_grad=True)
+
+    tape = wp.Tape()
+    with tape:
+        wp.launch(kernel=mul_constant, dim=x.size, inputs=[x], outputs=[y], block_dim=256, device=device)
+
+    tape.launches[0][6] = 2048
+
+    with assert_cuda_launch_error(test, r"Error launching kernel: .*mul_constant.*Warp CUDA error"):
+        tape.backward(grads={y: wp.ones_like(y)})
+
+
 devices = get_test_devices()
+cuda_devices = get_cuda_test_devices()
 
 
 class TestTape(unittest.TestCase):
@@ -367,6 +387,9 @@ def test_tape_empty_nested_scope_markers_removed(self):
 add_function_test(TestTape, "test_tape_struct_subscript", test_tape_struct_subscript, devices=devices)
 add_function_test(TestTape, "test_tape_nested_struct_subscript", test_tape_nested_struct_subscript, devices=devices)
 add_function_test(TestTape, "test_tape_visualize_subscript", test_tape_visualize_subscript, devices=devices)
+add_function_test(
+    TestTape, "test_tape_backward_cuda_launch_failure", test_tape_backward_cuda_launch_failure, devices=cuda_devices
+)
 
 
 if __name__ == "__main__":
diff --git a/warp/tests/unittest_utils.py b/warp/tests/unittest_utils.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import contextlib
 import ctypes
 import ctypes.util
 import importlib.util
@@ -147,6 +148,19 @@ def get_cuda_test_devices(mode=None):
     return [d for d in devices if d.is_cuda]
 
 
+@contextlib.contextmanager
+def assert_cuda_launch_error(test, pattern=r"Warp CUDA error"):
+    """Assert a CUDA launch raises while suppressing expected native stderr."""
+    wp.init()
+    saved_error_output_enabled = wp._src.context.runtime.core.wp_is_error_output_enabled()
+    wp._src.context.runtime.core.wp_set_error_output_enabled(False)
+    try:
+        with test.assertRaisesRegex(RuntimeError, pattern):
+            yield
+    finally:
+        wp._src.context.runtime.core.wp_set_error_output_enabled(saved_error_output_enabled)
+
+
 def get_cuda_device_pair_with_peer_access_support(devices=None):
     """Return the first CUDA pair where ``peer_device`` can access ``target_device`` allocations."""