@@ -175,11 +175,17 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size):
175
175
"inputs" : input_tensors ,
176
176
"enabled_precisions" : {precision_to_dtype (precision )},
177
177
"truncate_long_and_double" : params .get ("truncate" , False ),
178
+ "use_python_runtime" : params .get ("use_python_runtime" , False ),
178
179
}
179
180
180
181
if precision == "int8" :
181
182
compile_settings .update ({"calib" : params .get ("calibration_cache" )})
182
183
184
+ if params .get ("enable_cuda_graph" , False ):
185
+ logging .warning (
186
+ f"Torchscript backend doesn't support CUDA Graphs. `--enable_cuda_graph` will be ignored."
187
+ )
188
+
183
189
start_compile = timeit .default_timer ()
184
190
model = torchtrt .compile (model , ir = "ts" , ** compile_settings )
185
191
end_compile = timeit .default_timer ()
@@ -217,19 +223,34 @@ def run_hf_dynamo(model, input_tensors, params, precision, batch_size):
217
223
inputs = input_tensors ,
218
224
enabled_precisions = {precision_to_dtype (precision )},
219
225
truncate_double = params .get ("truncate" , False ),
226
+ use_python_runtime = params .get ("use_python_runtime" , False ),
220
227
)
221
228
end_compile = timeit .default_timer ()
222
229
compile_time_s = end_compile - start_compile
223
- record_llm_perf (
224
- trt_model ,
225
- "Dynamo" ,
226
- input_tensors ,
227
- precision ,
228
- osl ,
229
- batch_size ,
230
- iters ,
231
- compile_time_s ,
232
- )
230
+
231
+ if params .get ("enable_cuda_graph" , False ):
232
+ with torchtrt .runtime .enable_cudagraphs (trt_model ) as cudagraphs_module :
233
+ record_llm_perf (
234
+ cudagraphs_module ,
235
+ "Dynamo" ,
236
+ input_tensors ,
237
+ precision ,
238
+ osl ,
239
+ batch_size ,
240
+ iters ,
241
+ compile_time_s ,
242
+ )
243
+ else :
244
+ record_llm_perf (
245
+ trt_model ,
246
+ "Dynamo" ,
247
+ input_tensors ,
248
+ precision ,
249
+ osl ,
250
+ batch_size ,
251
+ iters ,
252
+ compile_time_s ,
253
+ )
233
254
234
255
235
256
@run_with_try_except
@@ -262,14 +283,27 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
262
283
),
263
284
cache_built_engines = params .get ("cache_built_engines" , False ),
264
285
reuse_cached_engines = params .get ("reuse_cached_engines" , False ),
286
+ use_python_runtime = params .get ("use_python_runtime" , False ),
265
287
)
266
288
end_compile = timeit .default_timer ()
267
289
compile_time_s = end_compile - start_compile
268
290
iters = params .get ("iterations" , 20 )
269
291
270
- record_perf (
271
- model , "Dynamo" , input_tensors , precision , iters , batch_size , compile_time_s
272
- )
292
+ if params .get ("enable_cuda_graph" , False ):
293
+ with torchtrt .runtime .enable_cudagraphs (model ) as cudagraphs_module :
294
+ record_perf (
295
+ cudagraphs_module ,
296
+ "Dynamo" ,
297
+ input_tensors ,
298
+ precision ,
299
+ iters ,
300
+ batch_size ,
301
+ compile_time_s ,
302
+ )
303
+ else :
304
+ record_perf (
305
+ model , "Dynamo" , input_tensors , precision , iters , batch_size , compile_time_s
306
+ )
273
307
274
308
275
309
@run_with_try_except
@@ -292,6 +326,7 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
292
326
"enabled_precisions" : {precision_to_dtype (precision )},
293
327
"truncate" : params .get ("truncate" , False ),
294
328
"min_block_size" : params .get ("min_block_size" , 1 ),
329
+ "use_python_runtime" : params .get ("use_python_runtime" , False ),
295
330
}
296
331
start_compile = timeit .default_timer ()
297
332
model = torch .compile (model , backend = "tensorrt" , dynamic = None , options = compile_spec )
@@ -300,15 +335,27 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
300
335
compile_time_s = end_compile - start_compile
301
336
iters = params .get ("iterations" , 20 )
302
337
303
- record_perf (
304
- model ,
305
- "torch_compile" ,
306
- input_tensors ,
307
- precision ,
308
- iters ,
309
- batch_size ,
310
- compile_time_s ,
311
- )
338
+ if params .get ("enable_cuda_graph" , False ):
339
+ with torchtrt .runtime .enable_cudagraphs (model ) as cudagraphs_module :
340
+ record_perf (
341
+ cudagraphs_module ,
342
+ "torch_compile" ,
343
+ input_tensors ,
344
+ precision ,
345
+ iters ,
346
+ batch_size ,
347
+ compile_time_s ,
348
+ )
349
+ else :
350
+ record_perf (
351
+ model ,
352
+ "torch_compile" ,
353
+ input_tensors ,
354
+ precision ,
355
+ iters ,
356
+ batch_size ,
357
+ compile_time_s ,
358
+ )
312
359
313
360
314
361
@run_with_try_except
@@ -320,9 +367,13 @@ def run_hf_inductor(model, input_tensors, params, precision, batch_size):
320
367
# Mark dynamic shapes for input sequence
321
368
input_seq = input_tensors [0 ]
322
369
torch ._dynamo .mark_dynamic (input_seq , 1 , min = 1 , max = osl )
370
+ mode = "max-autotune"
371
+ if params .get ("enable_cuda_graph" , False ):
372
+ mode = "reduce-overhead"
373
+
323
374
start_compile = timeit .default_timer ()
324
375
# Compile the model
325
- model = torch .compile (model , backend = "inductor" , dynamic = None , mode = "max-autotune" )
376
+ model = torch .compile (model , backend = "inductor" , dynamic = None , mode = mode )
326
377
model (input_seq )
327
378
end_compile = timeit .default_timer ()
328
379
compile_time_s = end_compile - start_compile
@@ -356,15 +407,25 @@ def run_inductor(model, input_tensors, params, precision, batch_size):
356
407
if params ["is_text_llm" ]:
357
408
return run_hf_inductor (model , input_tensors , params , precision , batch_size )
358
409
410
+ mode = "max-autotune"
411
+ if params .get ("enable_cuda_graph" , False ):
412
+ mode = "reduce-overhead"
413
+
359
414
start_compile = timeit .default_timer ()
360
- model = torch .compile (model , backend = "inductor" , dynamic = None , mode = "max-autotune" )
415
+ model = torch .compile (model , backend = "inductor" , dynamic = None , mode = mode )
361
416
model (* input_tensors )
362
417
end_compile = timeit .default_timer ()
363
418
compile_time_s = end_compile - start_compile
364
419
iters = params .get ("iterations" , 20 )
365
420
366
421
record_perf (
367
- model , "inductor" , input_tensors , precision , iters , batch_size , compile_time_s
422
+ model ,
423
+ "inductor" ,
424
+ input_tensors ,
425
+ precision ,
426
+ iters ,
427
+ batch_size ,
428
+ compile_time_s ,
368
429
)
369
430
370
431
@@ -587,6 +648,16 @@ def run(
587
648
action = "store_true" ,
588
649
help = "Boolean flag to determine if the user provided model is a TRT engine or not" ,
589
650
)
651
+ arg_parser .add_argument (
652
+ "--use_python_runtime" ,
653
+ action = "store_true" ,
654
+ help = "Whether to use Python runtime or not. Using C++ runtime by default" ,
655
+ )
656
+ arg_parser .add_argument (
657
+ "--enable_cuda_graph" ,
658
+ action = "store_true" ,
659
+ help = "Whether to enable CUDA Graph. It is not used by default" ,
660
+ )
590
661
arg_parser .add_argument (
591
662
"--report" ,
592
663
type = str ,
0 commit comments