@@ -171,8 +171,7 @@ def __init__(
         self._input_buffers: List[torch.Tensor] = []
         self._output_buffers: List[torch.Tensor] = []
         self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
-        self._caller_stream: Optional[torch.cuda.Stream] = None
-        self._engine_stream: Optional[torch.cuda.Stream] = None
+        self._engine_stream: torch.cuda.Stream = torch.cuda.current_stream()
         self.output_tensors: Optional[List[torch.Tensor]] = None
         self.sync_stream = True

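A note on the `__init__` change above: `torch.cuda.current_stream()` is evaluated at call time, so the stream captured here is whichever stream happens to be current when the module is constructed; `setup_engine` re-reads it below. A minimal sketch of that behavior (illustrative only, assumes a CUDA device):

```python
import torch

# current_stream() reflects the stream active at the moment it is called,
# not a fixed handle to any one stream.
side_stream = torch.cuda.Stream()
default = torch.cuda.current_stream()

with torch.cuda.stream(side_stream):
    # Inside the context, the "current" stream is the side stream
    assert torch.cuda.current_stream() == side_stream

# After the context exits, the previous stream is current again
assert torch.cuda.current_stream() == default
```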
@@ -287,13 +286,6 @@ def setup_engine(self) -> None:
         ), f"TensorRT engine was not built to target current platform (target: {self.target_platform}, current: {Platform.current_platform()})"
-        # Stream handling: if the caller stream is the pytorch default stream, create a new engine stream
-        # otherwise, use the caller stream and disable stream synchronization
-        self._caller_stream = torch.cuda.current_stream()
-        if self._caller_stream == torch.cuda.default_stream():
-            self._engine_stream = torch.cuda.Stream()
-            self.sync_stream = True
-        else:
-            self._engine_stream = self._caller_stream
-            self.sync_stream = False
+        # Stream handling: enqueue engine work directly on the caller's current stream
+        self._engine_stream = torch.cuda.current_stream()

         self.initialized = True
         runtime = trt.Runtime(TRT_LOGGER)
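Taken together with the `__init__` hunk, the effect is to drop the dedicated engine stream that was fenced against the caller's stream with `wait_stream` in both directions, and to enqueue directly on the caller's current stream instead, so ordering is implicit. A minimal sketch of the two patterns, where `enqueue` is a hypothetical stand-in for `context.execute_async_v3` (assumes a CUDA device):

```python
import torch

def enqueue(x: torch.Tensor) -> torch.Tensor:
    return x * 2  # stand-in for context.execute_async_v3(stream)

x = torch.ones(4, device="cuda")

# Old pattern: dedicated engine stream, explicit fences in both directions.
caller = torch.cuda.current_stream()
engine = torch.cuda.Stream()
engine.wait_stream(caller)  # engine waits for the caller's pending work
with torch.cuda.stream(engine):
    y = enqueue(x)
caller.wait_stream(engine)  # caller waits before consuming y

# New pattern: enqueue on the caller's current stream; same-stream ordering
# makes the cross-stream fences unnecessary.
y = enqueue(x)
```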
@@ -559,9 +552,6 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 else nullcontext()
             ):

-                if self.sync_stream:
-                    self._engine_stream.wait_stream(self._caller_stream)
-
                 if self.cudagraphs_enabled:
                     if need_cudagraphs_record:
                         self.cudagraph = torch.cuda.CUDAGraph()
@@ -587,10 +577,15 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                     self.cudagraph.replay()  # type: ignore

                 else:
-                    self.context.execute_async_v3(self._engine_stream.cuda_stream)
+                    import warnings

-                if self.sync_stream:
-                    self._caller_stream.wait_stream(self._engine_stream)
+                    # Suppress warnings raised while enqueueing the engine
+                    # rather than dropping into the debugger on them
+                    with warnings.catch_warnings():
+                        warnings.simplefilter("ignore")
+                        self.context.execute_async_v3(
+                            self._engine_stream.cuda_stream
+                        )

                 if self.use_pre_allocated_outputs:
                     self.pre_allocated_outputs = self.create_output_tensors()
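The `warnings.catch_warnings()` context manager used above scopes the filter change to the `with` body and restores the previous filters on exit. A standalone demonstration of the idiom (not part of the commit):

```python
import warnings

def noisy() -> None:
    warnings.warn("spurious enqueue warning", RuntimeWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    noisy()  # suppressed: the "ignore" filter is active in this scope

noisy()  # emitted: the previous filters were restored on exit
```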
@@ -645,22 +641,12 @@ def run_output_allocator() -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 if self.profiling_enabled
                 else nullcontext()
             ):
-                self._caller_stream = torch.cuda.current_stream()
-                if (
-                    self._engine_stream == torch.cuda.default_stream()
-                    or self._engine_stream is None
-                ):
-                    self._engine_stream = torch.cuda.Stream()
-
-                self._engine_stream.wait_stream(self._caller_stream)

                 with torch.cuda.stream(self._engine_stream):
                     self.context.execute_async_v3(
                         self._engine_stream.cuda_stream
                     )  # The OutputAllocator is called by execute_async_v3()

-                self._caller_stream.wait_stream(self._engine_stream)
-
                 with (
                     torch.autograd.profiler.record_function(
                         "PythonTorchTensorRTModule:ProcessOutputs"
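Since `self._engine_stream` now aliases the caller's current stream, the `torch.cuda.stream(...)` block above no longer switches streams and appears to be kept as a harmless wrapper. An illustrative sketch (assumes a CUDA device):

```python
import torch

# Entering torch.cuda.stream() with the already-current stream is a no-op:
# work enqueued inside lands on the same stream the caller is using.
s = torch.cuda.current_stream()
with torch.cuda.stream(s):
    assert torch.cuda.current_stream() == s
```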