TensorRT-RTX 1.2 Release

tp5uiuc · tp5uiuc · commit 4556b66c96c8 · 2025-11-10T23:06:27.000Z
diff --git a/demo/flux1.dev/flux_demo.ipynb b/demo/flux1.dev/flux_demo.ipynb
diff --git a/demo/flux1.dev/flux_demo.py b/demo/flux1.dev/flux_demo.py
@@ -62,6 +62,13 @@ def main():
         choices=["bf16", "fp8", "fp4"],
     )
     parser.add_argument("--enable-runtime-cache", action="store_true", help="Enable runtime caching")
+    parser.add_argument(
+        "--cuda-graph-strategy",
+        type=str,
+        default="disabled",
+        help="Cuda graph strategy (default: disabled)",
+        choices=["disabled", "whole_graph_capture"],
+    )
     parser.add_argument("--low-vram", action="store_true", help="Enable low VRAM mode")
     parser.add_argument("--dynamic-shape", action="store_true", default=False, help="Enable dynamic-shape engines")
     parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
@@ -83,6 +90,7 @@ def main():
             num_inference_steps=args.num_inference_steps,
             hf_token=args.hf_token,
             low_vram=args.low_vram,
+            cuda_graph_strategy=args.cuda_graph_strategy,
             enable_runtime_cache=args.enable_runtime_cache,
         )
 
@@ -99,6 +107,7 @@ def main():
         logger.info(f"Guidance scale: {args.guidance_scale}")
         logger.info(f"Cache directory: {args.cache_dir}")
         logger.info(f"Low VRAM mode: {args.low_vram}")
+        logger.info(f"Cudagraphs: {args.cuda_graph_strategy}")
         logger.info(f"Dynamic shape: {args.dynamic_shape}")
         logger.info(f"Runtime caching: {args.enable_runtime_cache}")
         logger.info(f"Cache mode: {args.cache_mode}")
diff --git a/demo/flux1.dev/pipelines/flux_pipeline.py b/demo/flux1.dev/pipelines/flux_pipeline.py
@@ -66,6 +66,7 @@ def __init__(
         low_vram: bool = False,
         log_level: str = "INFO",
         enable_runtime_cache: bool = False,
+        cuda_graph_strategy: str = "disabled",
     ):
         super().__init__(
             pipeline_name="flux_1_dev",
@@ -77,6 +78,7 @@ def __init__(
             low_vram=low_vram,
             log_level=log_level,
             enable_runtime_cache=enable_runtime_cache,
+            cuda_graph_strategy=cuda_graph_strategy,
         )
 
         # Flux-specific parameters
@@ -250,7 +252,7 @@ def build_and_load_engine(
             )
 
             if is_compatible:
-                engine = Engine(engine_path, precision, model_id, self.runtime_cache_path)
+                engine = Engine(engine_path, precision, model_id, self.runtime_cache_path, self.cuda_graph_strategy)
                 try:
                     if not self.low_vram:
                         engine.load()
@@ -285,7 +287,7 @@ def build_and_load_engine(
             )
 
             logger.debug(f"Building engine for path {engine_path}")
-            engine = Engine(engine_path, precision, model_id, self.runtime_cache_path)
+            engine = Engine(engine_path, precision, model_id, self.runtime_cache_path, self.cuda_graph_strategy)
             engine.build(
                 onnx_path=str(onnx_path),
                 input_profile=input_profile,
diff --git a/demo/tests/test_license_headers.py b/demo/tests/test_license_headers.py
@@ -54,7 +54,6 @@ def find_files_by_pattern(cls, root_path, patterns):
         # Directories to exclude from license header checks (only within the repository)
         exclude_dirs = {
             "build",
-            ".venv",
         }
 
         files = []
diff --git a/demo/utils/engine.py b/demo/utils/engine.py
@@ -82,17 +82,18 @@ def __init__(
         precision: str,
         model_name: str,
         runtime_cache_path: Optional[str] = None,
+        cuda_graph_strategy: str = "disabled",
     ):
         self.engine_path = engine_path
         self.engine = None
         self.context = None
         self.tensors = OrderedDict()
-        self.cuda_graph_instance = None
         self.precision = precision
         self.model_name = model_name
         self.runtime_config = None
         self.runtime_cache = None
         self.runtime_cache_path = runtime_cache_path
+        self.cuda_graph_strategy = cuda_graph_strategy
 
     def __del__(self):
         del self.tensors
@@ -154,7 +155,7 @@ def build(
         )
 
         # Build command with arguments
-        build_command = [f"polygraphy convert {onnx_path} --convert-to trt --output {self.engine_path}"]
+        build_command = [f"polygraphy convert {onnx_path} --convert-to trt --use-gpu --output {self.engine_path}"]
 
         build_args = []
         verbosity = "extra_verbose" if verbose else "error"
@@ -254,6 +255,10 @@ def activate(self, device_memory: Optional[int] = None, defer_memory_allocation:
         """Create execution context"""
 
         self.runtime_config = self.engine.create_runtime_config()
+
+        if self.cuda_graph_strategy == "whole_graph_capture":
+            self.runtime_config.cuda_graph_strategy = trt.CudaGraphStrategy.WHOLE_GRAPH_CAPTURE
+
         if self.runtime_cache_path:
             if self.runtime_cache is None:
                 logger.debug("Creating runtime cache")
@@ -383,7 +388,7 @@ def deallocate_buffers(self):
         gc.collect()
         torch.cuda.empty_cache()
 
-    def infer(self, feed_dict: dict[str, Any], stream: torch.cuda.Stream, use_cuda_graph: bool = False):
+    def infer(self, feed_dict: dict[str, Any], stream: torch.cuda.Stream):
         """Run inference with the engine"""
         # Copy input data to tensors
         for name, buf in feed_dict.items():
@@ -394,26 +399,8 @@ def infer(self, feed_dict: dict[str, Any], stream: torch.cuda.Stream, use_cuda_g
             self.context.set_tensor_address(name, tensor.data_ptr())
 
         # Execute inference
-        if use_cuda_graph:
-            if self.cuda_graph_instance is not None:
-                _CUASSERT(cudart.cudaGraphLaunch(self.cuda_graph_instance, stream))
-                _CUASSERT(cudart.cudaStreamSynchronize(stream))
-            else:
-                # Initial inference before CUDA graph capture
-                noerror = self.context.execute_async_v3(stream)
-                if not noerror:
-                    raise ValueError(f"ERROR: Inference with {self.engine_path} failed.")
-
-                # Capture CUDA graph
-                _CUASSERT(
-                    cudart.cudaStreamBeginCapture(stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal)
-                )
-                self.context.execute_async_v3(stream)
-                self.graph = _CUASSERT(cudart.cudaStreamEndCapture(stream))
-                self.cuda_graph_instance = _CUASSERT(cudart.cudaGraphInstantiate(self.graph, 0))
-        else:
-            noerror = self.context.execute_async_v3(stream)
-            if not noerror:
-                raise ValueError(f"ERROR: Inference with {self.engine_path} failed.")
+        noerror = self.context.execute_async_v3(stream)
+        if not noerror:
+            raise ValueError(f"ERROR: Inference with {self.engine_path} failed.")
 
         return self.tensors
diff --git a/demo/utils/pipeline.py b/demo/utils/pipeline.py
@@ -60,6 +60,7 @@ def __init__(
         low_vram: bool = False,
         log_level: str = "INFO",
         enable_runtime_cache: bool = False,
+        cuda_graph_strategy: str = "disabled",
     ):
         """
         Initialize pipeline.
@@ -75,6 +76,7 @@ def __init__(
             low_vram: Enable low VRAM mode
             log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
             enable_runtime_cache: Enable use of serialized runtime cache to improve JIT compilation times
+            cuda_graph_strategy: CUDA graph strategy used for accelerated inference ("disabled" or "whole_graph_capture")
         """
         # Configure logging FIRST, before any other operations
         self.configure_logging(verbose, log_level)
@@ -89,6 +91,13 @@ def __init__(
         self.verbose = verbose
         self.hf_token = hf_token
         self.low_vram = low_vram
+        self.enable_runtime_cache = enable_runtime_cache
+
+        assert cuda_graph_strategy in ["disabled", "whole_graph_capture"], (
+            f"Invalid cuda graph strategy {cuda_graph_strategy}, must be either 'disabled' or 'whole_graph_capture'"
+        )
+        logger.debug(f"Cuda graph strategy: {cuda_graph_strategy}")
+        self.cuda_graph_strategy = cuda_graph_strategy
 
         if enable_runtime_cache:
             self.runtime_cache_path = os.path.join(cache_dir, "runtime.cache")
@@ -276,7 +285,7 @@ def calculate_max_device_memory(self) -> int:
     def run_engine(self, model_name: str, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
         """Run inference on a specific engine"""
         engine = self.engines[model_name]
-        return engine.infer(inputs, self.stream, use_cuda_graph=False)
+        return engine.infer(inputs, self.stream)
 
     def infer(self, *args, **kwargs):
         """Run the full pipeline inference - to be implemented by subclasses"""
diff --git a/samples/apiUsage/cpp/apiUsage.cpp b/samples/apiUsage/cpp/apiUsage.cpp
@@ -564,6 +564,24 @@ int main()
     }
     useOptionalAdvancedDynamicShapesAPI(runtimeConfig.get(), inferenceEngine.get());
 
+    // Enable CUDA graph whole-graph capture for accelerated inference.
+    {
+        // TensorRT-RTX can record CUDA graphs to reduce kernel launch overhead during JIT inference.
+        // kDISABLED skips graph capture and runs kernels directly on the stream.
+        // kWHOLE_GRAPH_CAPTURE captures the complete computational graph of the model
+        //    and executes it atomically on the GPU stream. It automatically handles dynamic shape
+        //    cases, capturing the CUDA graph after shape-specialized kernels are compiled for a given shape.
+        bool const setCudaGraphStrategySuccess
+            = runtimeConfig->setCudaGraphStrategy(nvinfer1::CudaGraphStrategy::kWHOLE_GRAPH_CAPTURE);
+        if (!setCudaGraphStrategySuccess)
+        {
+            std::cerr << "Failed to set cuda graph strategy!" << std::endl;
+            return EXIT_FAILURE;
+        }
+        // Query API to illustrate retrieval.
+        (void) runtimeConfig->getCudaGraphStrategy();
+    }
+
     // Create an engine execution context out of the deserialized engine.
     // TRT-RTX performs "Just-in-Time" (JIT) optimization here, targeting the current GPU.
     // JIT phase is faster than AOT phase, and typically completes in under 15 seconds.
diff --git a/samples/apiUsage/python/api_usage.py b/samples/apiUsage/python/api_usage.py
@@ -364,6 +364,14 @@ def run_inference(serialized_engine: trt.IHostMemory, fc1_weights: trt.Weights,
 
     use_optional_advanced_dynamic_shapes_api(runtime_config, inference_engine)
 
+    # Enable CUDA graph whole-graph capture for accelerated inference.
+    # TensorRT-RTX can record CUDA graphs to reduce kernel launch overhead during JIT inference.
+    # DISABLED skips graph capture and runs kernels directly on the stream.
+    # WHOLE_GRAPH_CAPTURE captures the complete computational graph of the model
+    #    and executes it atomically on the GPU stream. It automatically handles dynamic shape
+    #    cases, capturing the CUDA graph after shape-specialized kernels are compiled for a given shape.
+    runtime_config.cuda_graph_strategy = trt.CudaGraphStrategy.WHOLE_GRAPH_CAPTURE
+
     # Create an engine execution context out of the deserialized engine.
     # TRT-RTX performs "Just-in-Time" (JIT) optimization here, targeting the current GPU.
     # JIT phase is faster than AOT phase, and typically completes in under 15 seconds.
diff --git a/samples/cmake/modules/get_version.cmake b/samples/cmake/modules/get_version.cmake
@@ -43,14 +43,20 @@ function(get_version include_dir version_variable soversion_variable)
   endif()
 
   foreach(type MAJOR MINOR PATCH)
-    string(REGEX MATCH "TRT_${type}_RTX [0-9]+" TRT_TYPE_STRING ${VERSION_STRINGS})
-    string(REGEX MATCH "[0-9]+" TRT_${type} ${TRT_TYPE_STRING})
-    if(NOT TRT_${type})
+    set(trt_${type} "")
+    foreach(version_line ${VERSION_STRINGS})
+      string(REGEX MATCH "TRT_${type}_RTX [0-9]+" trt_type_string "${version_line}")
+      if(trt_type_string)
+        string(REGEX MATCH "[0-9]+" trt_${type} "${trt_type_string}")
+        break()
+      endif()
+    endforeach()
+    if("${trt_${type}}" STREQUAL "")
       message(FATAL_ERROR "Failed to extract TRT_${type}_RTX from ${header_file}")
     endif()
   endforeach(type)
-  set(${version_variable} ${TRT_MAJOR}.${TRT_MINOR}.${TRT_PATCH} PARENT_SCOPE)
-  set(${soversion_variable} ${TRT_MAJOR}_${TRT_MINOR} PARENT_SCOPE)
+  set(${version_variable} ${trt_MAJOR}.${trt_MINOR}.${trt_PATCH} PARENT_SCOPE)
+  set(${soversion_variable} ${trt_MAJOR}_${trt_MINOR} PARENT_SCOPE)
 endfunction()
 
 # -----------------------------------------------------------------------------

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,6 @@ def find_files_by_pattern(cls, root_path, patterns):`
`54`	`54`	`# Directories to exclude from license header checks (only within the repository)`
`55`	`55`	`exclude_dirs = {`
`56`	`56`	`"build",`
`57`		`- ".venv",`
`58`	`57`	`}`
`59`	`58`
`60`	`59`	`files = []`