Skip to content

Commit 2b630e8

Browse files
committed
feat(runtime): add TensorRT-RTX native CUDA graph strategy to C++ runtime
Wire cuda_graph_strategy into the C++ runtime and make the execute_engine CUDA graph path TensorRT-RTX-aware. Fills in the apply_cuda_graph_strategy stub and adds coexistence handling for outer whole-graph capture. What - apply_cuda_graph_strategy() now calls IRuntimeConfig::setCudaGraphStrategy with either kDISABLED (default) or kWHOLE_GRAPH_CAPTURE. On RTX this hands capture/replay off to the TRT-RTX runtime, avoiding the lazy-kernel and dynamic-shape hazards of wrapping enqueueV3 in at::cuda::CUDAGraph. - is_monolithic_capturable(stream) returns whether an engine can safely be captured by an outer torch.cuda.CUDAGraph: RTX builds check IExecutionContext::isStreamCapturable and require a non-lazy kernel strategy; non-RTX builds always return true. - disable_rtx_native_cudagraphs() is a one-shot switch that turns off the engine internal capture and recreates the execution context so that outer stream captures contain the kernel launches directly. - execute_engine.cpp now computes effective_cudagraphs. On RTX, if a cuda_graph_strategy is set or SUBGRAPH cudagraphs is enabled, it bypasses the manual at::cuda::CUDAGraph path (the TRT-RTX runtime handles that inside enqueueV3). It also polls cudaStreamIsCapturing on the engine stream and, if an outer capture is already running, invokes disable_rtx_native_cudagraphs() so the outer capture proceeds without collision. Why - On TRT-RTX, the manual at::cuda::CUDAGraph wrapper around enqueueV3 can freeze fallback kernels in the captured graph (kLAZY specialisation would swap them later), and fails outright when the engine needs runtime allocation, DDS, control flow, or weight streaming. - Letting the TRT-RTX runtime own capture fixes both problems, and the outer-capture detection keeps the feature compatible with the existing CudaGraphsTorchTensorRTModule whole-graph wrapper without requiring it to know anything about RTX internals. 
Tests - tests/py/dynamo/runtime/test_000_cuda_graph_strategy.py validates the setting default, both {disabled, whole_graph_capture} through the C++ runtime, the RTX-native override when set_cudagraphs_mode(True) is combined with a strategy, repeated inference correctness, and ValueError rejection of unknown strategy names.
1 parent 481455f commit 2b630e8

4 files changed

Lines changed: 188 additions & 7 deletions

File tree

core/runtime/TRTEngine.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,33 @@ void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationSt
552552
}
553553
}
554554

555+
// Report whether this engine can safely be captured by an outer monolithic CUDA graph
// (e.g. CudaGraphsTorchTensorRTModule wrapping the whole module in torch.cuda.CUDAGraph).
// Non-RTX builds have no engine-internal capture machinery and are always capturable.
bool TRTEngine::is_monolithic_capturable(cudaStream_t stream) const {
#if defined(TRT_MAJOR_RTX) && defined(ENABLE_FEATURE_DISABLE_RUNTIME_ALLOCATION)
  // The "lazy" kernel strategy (value 0) swaps specialized kernels in mid-run, which
  // would invalidate a captured graph. Any other strategy (eager/none) combined with a
  // capturable stream is safe for outer monolithic capture.
  const bool stream_ok = exec_ctx->isStreamCapturable(stream);
  const bool kernels_stable = (dynamic_shapes_kernel_strategy != 0);
  return stream_ok && kernels_stable;
#else
  (void)stream; // unused outside RTX builds
  return true;
#endif
}
566+
567+
// One-shot switch: permanently turn off the TRT-RTX native CUDA graph capture on this
// engine so that an outer stream capture (detected by execute_engine) can proceed
// without colliding with the engine's internal capture/replay. No-op on non-RTX builds.
void TRTEngine::disable_rtx_native_cudagraphs() {
#ifdef TRT_MAJOR_RTX
  // Nothing to do if we already disabled it, or if native capture was never enabled.
  if (rtx_native_cudagraphs_disabled || cuda_graph_strategy == 0) {
    return;
  }
  LOG_WARNING(
      "Outer CUDA stream capture detected; disabling TRT-RTX native CUDA graph strategy on engine "
      << name << " for the remainder of its lifetime.");
  // Clear the strategy, push it down to the runtime config, and rebuild the execution
  // context so subsequent enqueueV3 calls launch kernels directly on the stream.
  cuda_graph_strategy = 0;
  apply_cuda_graph_strategy();
  recreate_execution_context();
  rtx_native_cudagraphs_disabled = true;
#endif
}
581+
555582
void TRTEngine::recreate_execution_context() {
556583
#ifdef TRT_MAJOR_RTX
557584
if (!runtime_config) {
@@ -605,7 +632,12 @@ void TRTEngine::apply_dynamic_shapes_kernel_strategy() {
605632
}
606633

607634
// Push the configured CUDA graph strategy down to the TRT-RTX runtime config.
// cuda_graph_strategy: 0 = disabled (default), 1 = whole-graph capture handled
// internally by the TRT-RTX runtime inside enqueueV3.
void TRTEngine::apply_cuda_graph_strategy() {
#ifdef TRT_MAJOR_RTX
  // runtime_config is only declared on RTX builds and may not have been created yet
  // (recreate_execution_context() guards on it the same way); bail out rather than
  // dereference a null shared_ptr.
  if (!runtime_config) {
    LOG_WARNING("No runtime config available; CUDA graph strategy not applied.");
    return;
  }
  bool ok = runtime_config->setCudaGraphStrategy(
      cuda_graph_strategy == 1 ? nvinfer1::CudaGraphStrategy::kWHOLE_GRAPH_CAPTURE
                               : nvinfer1::CudaGraphStrategy::kDISABLED);
  if (!ok) {
    LOG_WARNING("Failed to set CUDA graph strategy; continuing with default.");
  }
#endif
}
610642

611643
void TRTEngine::load_runtime_cache() {

core/runtime/TRTEngine.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,12 +233,24 @@ struct TRTEngine : torch::CustomClassHolder {
233233
std::string runtime_cache_path = "";
234234
int dynamic_shapes_kernel_strategy = 0; // 0=lazy, 1=eager, 2=none
235235
int cuda_graph_strategy = 0; // 0=disabled, 1=whole_graph_capture
236+
// One-shot flag: set the first time execute_engine detects an outer stream capture around
237+
// this engine, at which point its TRT-RTX native CUDA graph capture is turned off so the
238+
// two do not fight. The flag stays set for the remainder of the engine's lifetime.
239+
bool rtx_native_cudagraphs_disabled = false;
236240

237241
#ifdef TRT_MAJOR_RTX
238242
std::shared_ptr<nvinfer1::IRuntimeConfig> runtime_config;
239243
std::shared_ptr<nvinfer1::IRuntimeCache> runtime_cache;
240244
#endif
241245

246+
// Monolithic-capturability check used when this engine is wrapped by an outer whole-graph
247+
// capture (e.g. CudaGraphsTorchTensorRTModule). Non-RTX builds always return true.
248+
bool is_monolithic_capturable(cudaStream_t stream) const;
249+
250+
// Disable TRT-RTX native CUDA graph capture on this engine (one-shot, invoked when an
251+
// outer stream capture is detected around execute_engine). No-op on non-RTX.
252+
void disable_rtx_native_cudagraphs();
253+
242254
private:
243255
// Single entry point that (re)creates exec_ctx. On RTX builds this also creates / reuses
244256
// the IRuntimeConfig and applies all runtime config settings.

core/runtime/execute_engine.cpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,29 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
217217

218218
auto run_standard_execution = [&]() {
219219
bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
220+
// effective_cudagraphs controls the manual at::cuda::CUDAGraph path below. On TRT-RTX
221+
// builds we bypass that path whenever the engine has a cuda_graph_strategy set or the
222+
// outer runtime has requested subgraph cudagraphs - the TRT-RTX runtime handles capture
223+
// and replay internally inside enqueueV3. If an outer stream capture is already in
224+
// progress (e.g. the caller wraps this module in CudaGraphsTorchTensorRTModule for
225+
// whole-graph capture), RTX-native capture would conflict, so we disable it one-shot.
226+
bool effective_cudagraphs = cudagraphs_enabled;
227+
#ifdef TRT_MAJOR_RTX
228+
if (compiled_engine->cuda_graph_strategy != 0 || cudagraphs_enabled) {
229+
effective_cudagraphs = false;
230+
cudaStreamCaptureStatus capture_status;
231+
cudaStreamIsCapturing(compiled_engine->engine_stream.stream(), &capture_status);
232+
if (capture_status != cudaStreamCaptureStatusNone) {
233+
compiled_engine->disable_rtx_native_cudagraphs();
234+
}
235+
}
236+
#endif
237+
220238
bool shape_changed = _validate_shapes(inputs, compiled_engine);
221239

222240
// Whether cudagraphs needs to record the graph on this pass
223241
auto result = compiled_engine->runtime_states.set_runtime_states(
224-
cudagraphs_enabled, compiled_engine->use_pre_allocated_outputs, shape_changed);
242+
effective_cudagraphs, compiled_engine->use_pre_allocated_outputs, shape_changed);
225243

226244
bool need_cudagraphs_record = std::get<0>(result);
227245
bool can_use_pre_allocated_outputs = std::get<1>(result);
@@ -244,7 +262,8 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
244262
std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->input_profile_path);
245263
}
246264

247-
setup_input_tensors(inputs, compiled_engine, cudagraphs_enabled, need_cudagraphs_record, inputShapeTensorValues);
265+
setup_input_tensors(
266+
inputs, compiled_engine, effective_cudagraphs, need_cudagraphs_record, inputShapeTensorValues);
248267
// Check if input shapes can be inferred.
249268
int32_t const io_size{compiled_engine->cuda_engine->getNbIOTensors()};
250269
std::vector<char const*> names(io_size);
@@ -276,7 +295,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
276295
compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
277296
}
278297

279-
if (cudagraphs_enabled) {
298+
if (effective_cudagraphs) {
280299
TORCHTRT_CHECK(
281300
compiled_engine->exec_ctx->setTensorAddress(
282301
name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
@@ -316,8 +335,10 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
316335
caller_exec_complete.record(compiled_engine->caller_stream);
317336
caller_exec_complete.block(compiled_engine->engine_stream);
318337

319-
if (!cudagraphs_enabled) {
320-
// Direct execution uses the caller buffers directly
338+
if (!effective_cudagraphs) {
339+
// Direct execution uses the caller buffers directly. On TRT-RTX with a
340+
// cuda_graph_strategy set, the engine captures/replays internally during
341+
// this enqueueV3 call.
321342
compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
322343
} else {
323344
if (need_cudagraphs_record) {
@@ -350,7 +371,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
350371
trt_exec_complete.record(compiled_engine->engine_stream);
351372
trt_exec_complete.block(compiled_engine->caller_stream);
352373

353-
if (cudagraphs_enabled) {
374+
if (effective_cudagraphs) {
354375
// If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
355376
for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
356377
outputs[o].copy_(compiled_engine->output_buffers[o], false);
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import unittest
2+
3+
import torch
4+
import torch_tensorrt as torchtrt
5+
from torch.testing._internal.common_utils import TestCase, run_tests
6+
from torch_tensorrt._features import ENABLED_FEATURES
7+
from torch_tensorrt.dynamo._defaults import CUDA_GRAPH_STRATEGY
8+
from torch_tensorrt.dynamo._settings import CompilationSettings
9+
10+
11+
class CudaGraphModel(torch.nn.Module):
    """Minimal conv + relu model used to exercise the CUDA graph strategy paths."""

    def __init__(self):
        super().__init__()
        # 3 -> 8 channels, 3x3 kernel; padding=1 preserves spatial dimensions.
        self.conv = torch.nn.Conv2d(3, 8, 3, padding=1)

    def forward(self, x):
        features = self.conv(x)
        return torch.relu(features)
18+
19+
20+
def _compile_cpp(strategy):
    """Compile CudaGraphModel through the dynamo IR / C++ runtime with `strategy`.

    Returns the compiled module together with the sample inputs used to trace it.
    """
    module = CudaGraphModel().eval().cuda()
    sample_inputs = [torch.randn(2, 3, 16, 16).cuda()]
    trt_module = torchtrt.compile(
        module,
        ir="dynamo",
        inputs=sample_inputs,
        enabled_precisions={torch.float32},
        use_python_runtime=False,
        min_block_size=1,
        cuda_graph_strategy=strategy,
    )
    # Reset dynamo caches so later compiles in the same process start from scratch.
    torch._dynamo.reset()
    return trt_module, sample_inputs
34+
35+
36+
class TestCudaGraphStrategySettings(TestCase):
    """Setting-level validation that runs on every build (RTX and non-RTX)."""

    def test_default_value(self):
        # The default in CompilationSettings must track the declared constant.
        self.assertEqual(CompilationSettings().cuda_graph_strategy, CUDA_GRAPH_STRATEGY)

    def test_settable_values(self):
        for strategy in ("disabled", "whole_graph_capture"):
            configured = CompilationSettings(cuda_graph_strategy=strategy)
            self.assertEqual(configured.cuda_graph_strategy, strategy)
47+
48+
49+
@unittest.skipIf(
    not ENABLED_FEATURES.torch_tensorrt_runtime,
    "C++ runtime is not available",
)
@unittest.skipIf(
    not ENABLED_FEATURES.tensorrt_rtx,
    "CUDA graph strategy is a TensorRT-RTX feature",
)
class TestCudaGraphStrategyCpp(TestCase):
    """End-to-end: compile + infer through the C++ runtime with each strategy."""

    def tearDown(self):
        # Never let cudagraphs mode leak into other tests in the suite.
        torchtrt.runtime.set_cudagraphs_mode(False)

    @staticmethod
    def _infer(compiled, inputs):
        # Clone inputs so repeated calls never alias the tracing tensors.
        return compiled(*[tensor.clone() for tensor in inputs])

    def _check_output(self, result):
        self.assertEqual(tuple(result.shape), (2, 8, 16, 16))
        self.assertTrue(torch.isfinite(result).all().item())

    def test_disabled(self):
        compiled, inputs = _compile_cpp("disabled")
        self._check_output(self._infer(compiled, inputs))

    def test_whole_graph_capture(self):
        compiled, inputs = _compile_cpp("whole_graph_capture")
        self._check_output(self._infer(compiled, inputs))

    def test_whole_graph_capture_with_subgraph_cudagraphs(self):
        """Subgraph cudagraph mode + RTX strategy: RTX-native should take over without errors."""
        compiled, inputs = _compile_cpp("whole_graph_capture")
        torchtrt.runtime.set_cudagraphs_mode(True)
        self._check_output(self._infer(compiled, inputs))

    def test_repeated_inference(self):
        """Repeated inference exercises the RTX-native capture/replay path."""
        compiled, inputs = _compile_cpp("whole_graph_capture")
        reference = self._infer(compiled, inputs)
        for _ in range(4):
            result = self._infer(compiled, inputs)
            self.assertEqual(result.shape, reference.shape)
            self.assertTrue(torch.isfinite(result).all().item())
91+
92+
93+
@unittest.skipIf(
    not ENABLED_FEATURES.torch_tensorrt_runtime,
    "C++ runtime is not available",
)
class TestCudaGraphStrategyInvalidValue(TestCase):
    """Invalid strategy names are rejected at engine-packing time."""

    def test_invalid_strategy_raises(self):
        module = CudaGraphModel().eval().cuda()
        sample_inputs = [torch.randn(2, 3, 16, 16).cuda()]
        # Either a ValueError (Python-side validation) or RuntimeError (C++ side)
        # is acceptable; what matters is that compilation does not silently succeed.
        with self.assertRaises((ValueError, RuntimeError)):
            torchtrt.compile(
                module,
                ir="dynamo",
                inputs=sample_inputs,
                enabled_precisions={torch.float32},
                use_python_runtime=False,
                min_block_size=1,
                cuda_graph_strategy="not_a_real_strategy",
            )
113+
114+
115+
if __name__ == "__main__":
    # Support direct invocation: python test_000_cuda_graph_strategy.py
    run_tests()

0 commit comments

Comments
 (0)