feat: add --use_python_runtime and --enable_cuda_graph args to the perf run script (#3397)

zewenli98 · web-flow · commit b43c4c231a30 · 2025-02-24T15:22:10.000-08:00
diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
@@ -175,11 +175,17 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size):
         "inputs": input_tensors,
         "enabled_precisions": {precision_to_dtype(precision)},
         "truncate_long_and_double": params.get("truncate", False),
+        "use_python_runtime": params.get("use_python_runtime", False),
     }
 
     if precision == "int8":
         compile_settings.update({"calib": params.get("calibration_cache")})
 
+    if params.get("enable_cuda_graph", False):
+        logging.warning(
+            f"Torchscript backend doesn't support CUDA Graphs. `--enable_cuda_graph` will be ignored."
+        )
+
     start_compile = timeit.default_timer()
     model = torchtrt.compile(model, ir="ts", **compile_settings)
     end_compile = timeit.default_timer()
@@ -217,19 +223,34 @@ def run_hf_dynamo(model, input_tensors, params, precision, batch_size):
         inputs=input_tensors,
         enabled_precisions={precision_to_dtype(precision)},
         truncate_double=params.get("truncate", False),
+        use_python_runtime=params.get("use_python_runtime", False),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
-    record_llm_perf(
-        trt_model,
-        "Dynamo",
-        input_tensors,
-        precision,
-        osl,
-        batch_size,
-        iters,
-        compile_time_s,
-    )
+
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
+            record_llm_perf(
+                cudagraphs_module,
+                "Dynamo",
+                input_tensors,
+                precision,
+                osl,
+                batch_size,
+                iters,
+                compile_time_s,
+            )
+    else:
+        record_llm_perf(
+            trt_model,
+            "Dynamo",
+            input_tensors,
+            precision,
+            osl,
+            batch_size,
+            iters,
+            compile_time_s,
+        )
 
 
 @run_with_try_except
@@ -262,14 +283,27 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         ),
         cache_built_engines=params.get("cache_built_engines", False),
         reuse_cached_engines=params.get("reuse_cached_engines", False),
+        use_python_runtime=params.get("use_python_runtime", False),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
-    record_perf(
-        model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
-    )
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
+            record_perf(
+                cudagraphs_module,
+                "Dynamo",
+                input_tensors,
+                precision,
+                iters,
+                batch_size,
+                compile_time_s,
+            )
+    else:
+        record_perf(
+            model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
+        )
 
 
 @run_with_try_except
@@ -292,6 +326,7 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
         "enabled_precisions": {precision_to_dtype(precision)},
         "truncate": params.get("truncate", False),
         "min_block_size": params.get("min_block_size", 1),
+        "use_python_runtime": params.get("use_python_runtime", False),
     }
     start_compile = timeit.default_timer()
     model = torch.compile(model, backend="tensorrt", dynamic=None, options=compile_spec)
@@ -300,15 +335,27 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
-    record_perf(
-        model,
-        "torch_compile",
-        input_tensors,
-        precision,
-        iters,
-        batch_size,
-        compile_time_s,
-    )
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
+            record_perf(
+                cudagraphs_module,
+                "torch_compile",
+                input_tensors,
+                precision,
+                iters,
+                batch_size,
+                compile_time_s,
+            )
+    else:
+        record_perf(
+            model,
+            "torch_compile",
+            input_tensors,
+            precision,
+            iters,
+            batch_size,
+            compile_time_s,
+        )
 
 
 @run_with_try_except
@@ -320,9 +367,13 @@ def run_hf_inductor(model, input_tensors, params, precision, batch_size):
     # Mark dynamic shapes for input sequence
     input_seq = input_tensors[0]
     torch._dynamo.mark_dynamic(input_seq, 1, min=1, max=osl)
+    mode = "max-autotune"
+    if params.get("enable_cuda_graph", False):
+        mode = "reduce-overhead"
+
     start_compile = timeit.default_timer()
     # Compile the model
-    model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
+    model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
     model(input_seq)
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
@@ -356,15 +407,25 @@ def run_inductor(model, input_tensors, params, precision, batch_size):
     if params["is_text_llm"]:
         return run_hf_inductor(model, input_tensors, params, precision, batch_size)
 
+    mode = "max-autotune"
+    if params.get("enable_cuda_graph", False):
+        mode = "reduce-overhead"
+
     start_compile = timeit.default_timer()
-    model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
+    model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
     model(*input_tensors)
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
     record_perf(
-        model, "inductor", input_tensors, precision, iters, batch_size, compile_time_s
+        model,
+        "inductor",
+        input_tensors,
+        precision,
+        iters,
+        batch_size,
+        compile_time_s,
     )
 
 
@@ -587,6 +648,16 @@ def run(
         action="store_true",
         help="Boolean flag to determine if the user provided model is a TRT engine or not",
     )
+    arg_parser.add_argument(
+        "--use_python_runtime",
+        action="store_true",
+        help="Whether to use Python runtime or not. Using C++ runtime by default",
+    )
+    arg_parser.add_argument(
+        "--enable_cuda_graph",
+        action="store_true",
+        help="Whether to enable CUDA Graph. It is not used by default",
+    )
     arg_parser.add_argument(
         "--report",
         type=str,