Commit 8977704

fix: docs + new dep group
1 parent: e06f82a

12 files changed: +961 -29 lines

docsrc/index.rst

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,7 @@ Tutorials
 ------------
 
 * :ref:`torch_compile_advanced_usage`
+* :ref:`compile_with_dynamic_inputs`
 * :ref:`vgg16_ptq`
 * :ref:`engine_caching_example`
 * :ref:`engine_caching_bert_example`
@@ -70,6 +71,7 @@ Tutorials
 * :ref:`auto_generate_plugins`
 * :ref:`mutable_torchtrt_module_example`
 * :ref:`weight_streaming_example`
+* :ref:`dynamic_memory_allocation`
 * :ref:`pre_allocated_output_example`
 * :ref:`debugger_example`
 
@@ -79,6 +81,7 @@ Tutorials
    :hidden:
 
    tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage
+   tutorials/_rendered_examples/dynamo/compile_with_dynamic_inputs
    tutorials/_rendered_examples/dynamo/vgg16_ptq
    tutorials/_rendered_examples/dynamo/engine_caching_example
    tutorials/_rendered_examples/dynamo/engine_caching_bert_example
@@ -91,6 +94,7 @@ Tutorials
    tutorials/_rendered_examples/dynamo/auto_generate_plugins
    tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
    tutorials/_rendered_examples/dynamo/weight_streaming_example
+   tutorials/_rendered_examples/dynamo/dynamic_memory_allocation
    tutorials/_rendered_examples/dynamo/pre_allocated_output_example
 
 Dynamo Frontend

examples/dynamo/autocast_example.py

Lines changed: 12 additions & 10 deletions
@@ -106,17 +106,19 @@ def forward(self, x):
 ]
 should_be_bf16 = [autocast_outs[3], autocast_outs[4], autocast_outs[6]]
 
-assert all(
-    a.dtype == torch.float32 for a in should_be_fp32
-), "Some Autocast outputs are not float32!"
-assert all(
-    a.dtype == torch.float16 for a in should_be_fp16
-), "Some Autocast outputs are not float16!"
-assert all(
-    a.dtype == torch.bfloat16 for a in should_be_bf16
-), "Some Autocast outputs are not bfloat16!"
+assert all(a.dtype == torch.float32 for a in should_be_fp32), (
+    "Some Autocast outputs are not float32!"
+)
+assert all(a.dtype == torch.float16 for a in should_be_fp16), (
+    "Some Autocast outputs are not float16!"
+)
+assert all(a.dtype == torch.bfloat16 for a in should_be_bf16), (
+    "Some Autocast outputs are not bfloat16!"
+)
 for i, (a, w) in enumerate(zip(autocast_outs, pytorch_outs)):
     assert torch.allclose(
         a.to(torch.float32), w.to(torch.float32), atol=1e-2, rtol=1e-2
-    ), f"Autocast and Pytorch outputs do not match! autocast_outs[{i}] = {a}, pytorch_outs[{i}] = {w}"
+    ), (
+        f"Autocast and Pytorch outputs do not match! autocast_outs[{i}] = {a}, pytorch_outs[{i}] = {w}"
+    )
 print("All dtypes and values match!")

examples/dynamo/compile_with_dynamic_inputs.py

Lines changed: 77 additions & 3 deletions
@@ -1,3 +1,20 @@
+"""
+.. _compile_with_dynamic_inputs:
+
+Compiling Models with Dynamic Input Shapes
+==========================================================
+
+Dynamic shapes are essential when your model
+needs to handle varying batch sizes or sequence lengths at inference time without recompilation.
+
+The example uses a Vision Transformer-style model with expand and reshape operations,
+which are common patterns that benefit from dynamic shape handling.
+"""
+
+# %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 import logging
 
 import torch
@@ -8,7 +25,13 @@
 
 torch.manual_seed(0)
 
+# %%
+
 
+# Define a model with expand and reshape operations
+# This is a simplified Vision Transformer pattern with:
+# - A learnable class token that needs to expand to match batch size
+# - A QKV projection followed by reshaping for multi-head attention
 class ExpandReshapeModel(nn.Module):
     def __init__(self, embed_dim: int):
         super().__init__()
@@ -28,13 +51,40 @@ def forward(self, x: torch.Tensor):
 model = ExpandReshapeModel(embed_dim=768).cuda().eval()
 x = torch.randn(4, 196, 768).cuda()
 
-# 1. JIT: torch.compile
+# %%
+# Approach 1: JIT Compilation with `torch.compile`
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# The first approach uses PyTorch's `torch.compile` with the TensorRT backend.
+# This is a Just-In-Time (JIT) compilation method where the model is compiled
+# during the first inference call.
+#
+# Key points:
+#
+# - Use `torch._dynamo.mark_dynamic()` to specify which dimensions are dynamic
+# - The `index` parameter indicates which dimension (0 = batch dimension)
+# - Provide `min` and `max` bounds for the dynamic dimension
+# - The model will work for any batch size within the specified range
+
 x1 = x.clone()
 torch._dynamo.mark_dynamic(x1, index=0, min=2, max=32)
 trt_module = torch.compile(model, backend="tensorrt")
 out1 = trt_module(x1)
 
-# 2. AOT: torch_tensorrt.compile
+# %%
+# Approach 2: AOT Compilation with `torch_tensorrt.compile`
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# The second approach uses Ahead-Of-Time (AOT) compilation with `torch_tensorrt.compile`.
+# This compiles the model upfront before inference.
+#
+# Key points:
+#
+# - Use `torch_tensorrt.Input()` to specify dynamic shape ranges
+# - Provide `min_shape`, `opt_shape`, and `max_shape` for each input
+# - The `opt_shape` is used for optimization and should represent typical input sizes
+# - Set `ir="dynamo"` to use the Dynamo frontend
+
 x2 = x.clone()
 example_input = torch_tensorrt.Input(
     min_shape=[1, 196, 768],
@@ -45,14 +95,38 @@ def forward(self, x: torch.Tensor):
 trt_module = torch_tensorrt.compile(model, ir="dynamo", inputs=example_input)
 out2 = trt_module(x2)
 
-# 3. AOT: torch.export + Dynamo compile
+# %%
+# Approach 3: AOT with `torch.export` + Dynamo Compile
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# The third approach uses PyTorch 2.0's `torch.export` API combined with
+# Torch-TensorRT's Dynamo compiler. This provides the most explicit control
+# over dynamic shapes.
+#
+# Key points:
+#
+# - Use `torch.export.Dim()` to define symbolic dimensions with constraints
+# - Create a `dynamic_shapes` dictionary mapping inputs to their dynamic dimensions
+# - Export the model to an `ExportedProgram` with these constraints
+# - Compile the exported program with `torch_tensorrt.dynamo.compile`
+
 x3 = x.clone()
 bs = torch.export.Dim("bs", min=1, max=32)
 dynamic_shapes = {"x": {0: bs}}
 exp_program = torch.export.export(model, (x3,), dynamic_shapes=dynamic_shapes)
 trt_module = torch_tensorrt.dynamo.compile(exp_program, (x3,))
 out3 = trt_module(x3)
 
+# %%
+# Verify All Approaches Produce Identical Results
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# All three approaches should produce the same numerical results.
+# This verification ensures that dynamic shape handling works correctly
+# across different compilation methods.
+
 assert torch.allclose(out1, out2)
 assert torch.allclose(out1, out3)
 assert torch.allclose(out2, out3)
+
+print("All three approaches produced identical results!")

examples/dynamo/custom_kernel_plugins.py

Lines changed: 1 addition & 3 deletions
@@ -514,9 +514,7 @@ def deserialize_plugin(self, name: str, data: bytes) -> CircularPaddingPlugin:
 from torch_tensorrt.fx.converters.converter_utils import set_layer_name
 
 
-@dynamo_tensorrt_converter(
-    torch.ops.torchtrt_ex.triton_circular_pad.default
-)  # type: ignore
+@dynamo_tensorrt_converter(torch.ops.torchtrt_ex.triton_circular_pad.default)  # type: ignore
 # Recall the schema defined above:
 # torch.ops.torchtrt_ex.triton_circular_pad.default(Tensor x, IntList padding) -> Tensor
 def circular_padding_converter(

examples/dynamo/debugger_example.py

Lines changed: 0 additions & 1 deletion
@@ -50,7 +50,6 @@
         "remove_detach"
     ], # fx graph visualization before certain lowering pass
 ):
-
     trt_gm = torch_trt.dynamo.compile(
         exp_program,
         tuple(inputs),

examples/dynamo/dynamic_memory_allocation.py

Lines changed: 64 additions & 0 deletions
@@ -1,4 +1,24 @@
+"""
+.. _dynamic_memory_allocation:
+
+Dynamic Memory Allocation
+==========================================================
+
+This script demonstrates how to use dynamic memory allocation with Torch-TensorRT
+to reduce GPU memory footprint. When enabled, TensorRT engines allocate and deallocate resources
+dynamically during inference, which can significantly reduce peak memory usage.
+
+This is particularly useful when:
+
+- Running multiple models on the same GPU
+- Working with limited GPU memory
+- Memory usage needs to be minimized between inference calls
+"""
+
 # %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 import gc
 import time
 
@@ -11,6 +31,19 @@
 torch.manual_seed(5)
 inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
 
+# %%
+# Compilation Settings with Dynamic Memory Allocation
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Key settings for dynamic memory allocation:
+#
+# - ``dynamically_allocate_resources=True``: Enables dynamic resource allocation
+# - ``lazy_engine_init=True``: Delays engine initialization until first inference
+# - ``immutable_weights=False``: Allows weight refitting if needed
+#
+# With these settings, the engine will allocate GPU memory only when needed
+# and deallocate it after inference completes.
+
 settings = {
     "ir": "dynamo",
     "use_python_runtime": False,
@@ -25,6 +58,20 @@
 print((torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3)
 compiled_module(*inputs)
 
+# %%
+# Runtime Resource Allocation Control
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# You can control resource allocation behavior at runtime using the
+# ``ResourceAllocationStrategy`` context manager. This allows you to:
+#
+# - Switch between dynamic and static allocation modes
+# - Control when resources are allocated and deallocated
+# - Optimize memory usage for specific inference patterns
+#
+# In this example, we temporarily disable dynamic allocation to keep
+# resources allocated between inference calls, which can improve performance
+# when running multiple consecutive inferences.
 
 time.sleep(30)
 with torch_trt.dynamo.runtime.ResourceAllocationStrategy(
@@ -43,3 +90,20 @@
         (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3,
     )
     compiled_module(*inputs)
+
+# %%
+# Memory Usage Comparison
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Dynamic memory allocation trades off some performance for reduced memory footprint:
+#
+# **Benefits:**
+#
+# - Lower peak GPU memory usage
+# - Reduced memory pressure on shared GPUs
+#
+# **Considerations:**
+#
+# - Slight overhead from allocation/deallocation
+# - Best suited for scenarios where memory is constrained
+# - May not be necessary for single-model deployments with ample memory
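
Note: to make these knobs concrete, here is a minimal sketch of the pattern the new example describes, not part of the diff. The compile settings mirror the keys listed above; the ``ResourceAllocationStrategy`` arguments are an assumption, since the hunk truncates before that call completes, and the model is a stand-in.

import torch
import torch_tensorrt as torch_trt
import torchvision.models as models

# Stand-in model; the example's actual model is not shown in the hunks.
model = models.resnet18().eval().cuda()
inputs = [torch.rand((8, 3, 224, 224)).cuda()]

# Settings mirroring the keys called out above.
settings = {
    "ir": "dynamo",
    "use_python_runtime": False,
    "lazy_engine_init": True,
    "immutable_weights": False,
    "dynamically_allocate_resources": True,
}
compiled_module = torch_trt.compile(model, inputs=inputs, **settings)
compiled_module(*inputs)  # memory is allocated for the call, then released

# Assumed signature: pin resources in place for a burst of inferences.
with torch_trt.dynamo.runtime.ResourceAllocationStrategy(
    compiled_module, dynamically_allocate_resources=False
):
    for _ in range(10):
        compiled_module(*inputs)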

examples/dynamo/low_cpu_memory_compilation.py

Lines changed: 0 additions & 1 deletion
@@ -73,7 +73,6 @@ def forward(self, x):
     logging_dir="/home/profile/logging/moe",
     engine_builder_monitor=False,
 ):
-
     exp_program = torch.export.export(model, tuple(inputs))
     trt_gm = torchtrt.dynamo.compile(
         exp_program,

examples/dynamo/mutable_torchtrt_module_example.py

Lines changed: 3 additions & 3 deletions
@@ -57,9 +57,9 @@
 with torch.no_grad():
     expected_outputs, refitted_outputs = model2(*inputs), mutable_module(*inputs)
     for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
-        assert torch.allclose(
-            expected_output, refitted_output, 1e-2, 1e-2
-        ), "Refit Result is not correct. Refit failed"
+        assert torch.allclose(expected_output, refitted_output, 1e-2, 1e-2), (
+            "Refit Result is not correct. Refit failed"
+        )
 
 print("Refit successfully!")
examples/dynamo/nvrtc_aot_plugin.py

Lines changed: 3 additions & 3 deletions
@@ -239,8 +239,8 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 with torch.no_grad():
     for i in range(10):
         res = model_trt(input)
-        assert torch.allclose(
-            res, model(input), rtol=1e-2, atol=1e-2
-        ), "Results do not match!"
+        assert torch.allclose(res, model(input), rtol=1e-2, atol=1e-2), (
+            "Results do not match!"
+        )
 
 print("Inference successful!")

examples/dynamo/refit_engine_example.py

Lines changed: 6 additions & 5 deletions
@@ -100,13 +100,14 @@
 
 # Check the output
 with torch.no_grad():
-    expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(
-        *inputs
+    expected_outputs, refitted_outputs = (
+        exp_program2.module()(*inputs),
+        new_trt_gm(*inputs),
     )
     for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
-        assert torch.allclose(
-            expected_output, refitted_output, 1e-2, 1e-2
-        ), "Refit Result is not correct. Refit failed"
+        assert torch.allclose(expected_output, refitted_output, 1e-2, 1e-2), (
+            "Refit Result is not correct. Refit failed"
+        )
 
 print("Refit successfully!")