2 changes: 1 addition & 1 deletion .github/scripts/generate-release-matrix.py
@@ -12,7 +12,7 @@
"wheel": ["3.10", "3.11", "3.12", "3.13"],
"tarball": ["3.11"],
}
sbsa_container_image: str = "quay.io/pypa/manylinux_2_34_aarch64"
sbsa_container_image: str = "quay.io/pypa/manylinux_2_39_aarch64"

CXX11_TARBALL_CONTAINER_IMAGE = {
"cu130": "pytorch/libtorch-cxx11-builder:cuda13.0-main",
2 changes: 1 addition & 1 deletion .github/workflows/build_linux.yml
@@ -413,5 +413,5 @@ jobs:
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{inputs.is-release-wheel}}-${{inputs.is-release-tarball}}-${{inputs.use-rtx}}-${{inputs.architecture}}-${{inputs.is-jetpack}}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{inputs.is-release-wheel}}-${{inputs.is-release-tarball}}-${{inputs.use-rtx}}-${{inputs.architecture}}-${{inputs.is-jetpack}}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ startsWith(github.ref, 'refs/tags/') && github.ref_name || 'no-tag' }}
cancel-in-progress: true
2 changes: 1 addition & 1 deletion .github/workflows/build_windows.yml
@@ -438,5 +438,5 @@ jobs:
architecture: ${{ inputs.architecture }}

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ inputs.repository }}-${{ inputs.is-release-wheel }}-${{ inputs.is-release-tarball }}-${{ github.event_name == 'workflow_dispatch' }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ inputs.repository }}-${{ inputs.is-release-wheel }}-${{ inputs.is-release-tarball }}-${{ github.event_name == 'workflow_dispatch' }}-${{ startsWith(github.ref, 'refs/tags/') && github.ref_name || 'no-tag' }}
cancel-in-progress: true
3 changes: 2 additions & 1 deletion .github/workflows/release-linux-aarch64.yml
@@ -1,6 +1,7 @@
name: Release aarch64 Linux wheels and tarball artifacts

on:
pull_request:
push:
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
@@ -128,5 +129,5 @@ jobs:
architecture: "aarch64"

concurrency:
group: ${{ github.workflow }}-aarch64-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
group: ${{ github.workflow }}-aarch64-release-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
cancel-in-progress: true
2 changes: 1 addition & 1 deletion .github/workflows/release-linux-x86_64.yml
@@ -126,5 +126,5 @@ jobs:
is-release-wheel: true

concurrency:
group: ${{ github.workflow }}-x86_64-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
group: ${{ github.workflow }}-x86_64-release-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}
cancel-in-progress: true
8 changes: 7 additions & 1 deletion core/runtime/TRTEngine.cpp
@@ -142,6 +142,9 @@ TRTEngine::TRTEngine(
}
TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");

// Pre-allocate placeholder for empty tensors (TensorRT requires non-null addresses)
cudaMalloc(&empty_tensor_placeholder, 1);

runtime_states.old_cudagraphs = CUDAGRAPHS_MODE;
runtime_states.old_pre_allocated_outputs = false;
runtime_states.context_changed = false;
@@ -264,6 +267,9 @@ TRTEngine::~TRTEngine() {
trt_engine_profiler.reset();
exec_ctx.reset();
cuda_engine.reset();
if (empty_tensor_placeholder) {
cudaFree(empty_tensor_placeholder);
}
rt.reset();
}

@@ -315,7 +321,7 @@ void TRTEngine::set_profile_format(std::string format) {
}

std::string TRTEngine::get_engine_layer_info() {
auto inspector = cuda_engine->createEngineInspector();
auto inspector = make_trt(cuda_engine->createEngineInspector());
return inspector->getEngineInformation(nvinfer1::LayerInformationFormat::kJSON);
}

3 changes: 3 additions & 0 deletions core/runtime/TRTEngine.h
@@ -187,6 +187,9 @@ struct TRTEngine : torch::CustomClassHolder {
bool use_pre_allocated_outputs = false;
std::vector<at::Tensor> pre_allocated_outputs;

// Single placeholder buffer for empty tensor inputs (allocated once, reused)
void* empty_tensor_placeholder = nullptr;

// Output Allocator-Related Functionality
bool requires_output_allocator = false; // engine requires output allocator
bool use_output_allocator_outputs = false; // users specify to use output allocator
20 changes: 14 additions & 6 deletions core/runtime/execute_engine.cpp
@@ -149,18 +149,26 @@ void setup_input_tensors(
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setInputShape(name.c_str(), dims), "Error while setting the input shape");

at::Tensor final_input;
if (cudagraphs_enabled) {
// If using CUDAGraphs copy formatted input to the corresponding persistent input buffer
compiled_engine->input_buffers[i].copy_(formatted_inputs.back(), true);
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), compiled_engine->input_buffers[i].data_ptr()),
"Error while setting the input tensor address for inputs");
final_input = compiled_engine->input_buffers[i];
} else {
// Otherwise use the formatted buffer directly
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), formatted_inputs.back().data_ptr()),
"Error while setting the input tensor address for inputs");
final_input = formatted_inputs.back();
}

// Get tensor address, using placeholder for empty tensors
// TensorRT requires non-null address even if numel() = 0
// empty_tensor_placeholder is pre-allocated in TRTEngine constructor
void* input_addr = (final_input.numel() == 0 || final_input.data_ptr() == nullptr)
? compiled_engine->empty_tensor_placeholder
: final_input.data_ptr();

TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), input_addr),
"Failed to bind tensor address for " << name);
}
}
}
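For context on the hunk above: the comments note that TensorRT requires a non-null address even when numel() == 0, so the 1-byte placeholder allocated in the TRTEngine constructor is bound for empty inputs. A minimal Python sketch of the situation (illustrative only, not part of this change):

import torch

# A runtime dimension of 0 yields an empty tensor; its data pointer may be
# null, which is exactly the case the placeholder address stands in for.
x_empty = torch.empty(0, 196, 768, device="cuda")
print(x_empty.numel())     # 0
print(x_empty.data_ptr())  # may print 0 (null) for a zero-byte allocation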
1 change: 1 addition & 0 deletions cpp/bin/torchtrtc/main.cpp
@@ -356,6 +356,7 @@ int main(int argc, char** argv) {
}

if (enabled_precisions) {
compile_settings.enabled_precisions.clear();
for (const auto& precision : args::get(enabled_precisions)) {
auto dtype = torchtrtc::parserutil::parse_dtype(precision);
if (dtype == torchtrt::DataType::kFloat) {
58 changes: 58 additions & 0 deletions examples/dynamo/compile_with_dynamic_inputs.py
@@ -0,0 +1,58 @@
import logging

import torch
import torch.nn as nn
import torch_tensorrt

logging.basicConfig(level=logging.DEBUG)

torch.manual_seed(0)


class ExpandReshapeModel(nn.Module):
def __init__(self, embed_dim: int):
super().__init__()
self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
self.embed_dim = embed_dim
self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3)

def forward(self, x: torch.Tensor):
batch_size = x.shape[0]
cls_token = self.cls_token.expand(batch_size, -1, -1)
x = torch.cat([cls_token, x], dim=1)
x = self.qkv_proj(x)
reshaped_qkv = x.reshape(batch_size, x.size(1), 3, 12, -1)
return reshaped_qkv


model = ExpandReshapeModel(embed_dim=768).cuda().eval()
x = torch.randn(4, 196, 768).cuda()

# 1. JIT: torch.compile
x1 = x.clone()
torch._dynamo.mark_dynamic(x1, index=0, min=2, max=32)
trt_module = torch.compile(model, backend="tensorrt")
out1 = trt_module(x1)

# 2. AOT: torch_tensorrt.compile
x2 = x.clone()
example_input = torch_tensorrt.Input(
min_shape=[1, 196, 768],
opt_shape=[4, 196, 768],
max_shape=[32, 196, 768],
dtype=torch.float32,
)
trt_module = torch_tensorrt.compile(model, ir="dynamo", inputs=example_input)
out2 = trt_module(x2)

# 3. AOT: torch.export + Dynamo compile
x3 = x.clone()
bs = torch.export.Dim("bs", min=1, max=32)
dynamic_shapes = {"x": {0: bs}}
exp_program = torch.export.export(model, (x3,), dynamic_shapes=dynamic_shapes)
trt_module = torch_tensorrt.dynamo.compile(exp_program, (x3,))
out3 = trt_module(x3)

assert torch.allclose(out1, out2)
assert torch.allclose(out1, out3)
assert torch.allclose(out2, out3)
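As a follow-up to the AOT paths above (illustrative; the shape follows from the model definition in this example): the dynamic batch dimension lets the compiled module be re-run at other batch sizes within the declared [1, 32] range without recompiling.

x_small = torch.randn(2, 196, 768).cuda()
out_small = trt_module(x_small)  # trt_module is the export-based module here; batch 2 is within range
print(out_small.shape)           # expected: torch.Size([2, 197, 3, 12, 64])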
3 changes: 3 additions & 0 deletions examples/dynamo/torch_compile_resnet_example.py
@@ -48,6 +48,7 @@
model,
ir="torch_compile",
inputs=inputs,
use_explicit_typing=False,
enabled_precisions=enabled_precisions,
workspace_size=workspace_size,
min_block_size=min_block_size,
@@ -86,6 +87,7 @@
model,
ir="torch_compile",
inputs=inputs_bs8,
use_explicit_typing=False,
enabled_precisions=enabled_precisions,
workspace_size=workspace_size,
min_block_size=min_block_size,
@@ -111,6 +113,7 @@
dtype=torch.half,
)
],
"use_explicit_typing": False,
"enabled_precisions": enabled_precisions,
"ir": "dynamo",
}
16 changes: 14 additions & 2 deletions py/torch_tensorrt/_Input.py
@@ -1,11 +1,14 @@
from __future__ import annotations

import logging
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence, Tuple

import torch
from torch_tensorrt._enums import dtype, memory_format

logger = logging.getLogger(__name__)


class Input(object):
"""
@@ -149,6 +152,16 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
}
self.shape_mode = Input._ShapeMode.DYNAMIC

# Warn if min_shape has any 0 dimension (empty tensor) - TensorRT doesn't support this
# @apbose: Is this warning necessary?
if any(dim == 0 for dim in self.shape["min_shape"]):
logger.warning(
f"min_shape contains a 0 dimension: {self.shape['min_shape']}. "
"TensorRT does not support dynamic shapes with min dimension of 0 (empty tensors). "
"TensorRT will internally clamp min dimensions to 1, which may cause runtime errors "
"if you try to run inference with empty tensor inputs."
)

else:
raise ValueError(
f"Unexpected number of positional arguments for class Input \n Found {len(args)} arguments, expected either zero or a single positional arguments"
@@ -384,7 +397,7 @@ def example_tensor(
dtype=self.dtype.to(torch.dtype, use_default=True)
)
else:
RuntimeError(
raise RuntimeError(
f"Input shape is dynamic but shapes are not provided as sequence (found: {self.shape})"
)
else:
@@ -412,4 +425,3 @@ def example_tensor(
raise ValueError(
"Requested an example tensor from a dynamic shaped input but did not specific which profile field to use."
)
raise
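A hedged sketch of the case the new warning targets (the constructor arguments mirror the Input usage elsewhere in this PR; the shapes are illustrative):

import torch
import torch_tensorrt

# min_shape contains a 0 batch dimension, so constructing this Input should
# emit the warning added above; per that warning, TensorRT internally clamps
# the min dimension to 1.
inp = torch_tensorrt.Input(
    min_shape=[0, 196, 768],
    opt_shape=[4, 196, 768],
    max_shape=[32, 196, 768],
    dtype=torch.float32,
)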
6 changes: 6 additions & 0 deletions py/torch_tensorrt/_utils.py
@@ -76,6 +76,12 @@ def is_tegra_platform() -> bool:
return False


def is_orin() -> bool:
if torch.cuda.get_device_capability() in [(8, 7)]:
return True
return False


def is_thor() -> bool:
if torch.cuda.get_device_capability() in [(11, 0)]:
return True
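A small usage sketch for the new capability helpers (the import assumes the module path shown above; the branching is illustrative and requires a visible CUDA device):

from torch_tensorrt._utils import is_orin, is_thor

# Illustrative gating on the detected compute capability:
# 8.7 per is_orin(), 11.0 per is_thor() in the hunk above.
if is_orin():
    print("Orin-class device detected")
elif is_thor():
    print("Thor-class device detected")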
23 changes: 9 additions & 14 deletions py/torch_tensorrt/dynamo/conversion/_ConverterRegistry.py
@@ -20,7 +20,6 @@

import tensorrt as trt
import torch
import torch_tensorrt
from torch import SymBool, SymFloat, SymInt
from torch._ops import OpOverloadPacket
from torch.fx.node import Argument, Node, Target, _get_qualified_name
@@ -536,7 +535,7 @@ def __contains__(self, key: Target | Node) -> bool:
def get_all_converters_with_target(
self, key: Target, return_registry_info: bool = False
) -> Tuple[
Union[List[Any], Dict[str, int], None]
List[Any], Optional[Dict[str, int]]
]: # TODO: Narrow to ConverterImplSignature this when we can remove FX converters
"""Get all converters across all registries for the target

@@ -547,7 +546,7 @@ def get_all_converters_with_target(

# Store count of number of registered converters per registry
if return_registry_info:
registry_data = {name: 0 for name in self.registry_names}
registry_data = dict.fromkeys(self.registry_names, 0)

for index, registry in enumerate(self.registries):
if key in registry:
@@ -622,22 +621,18 @@ def display_all_available_converters(self) -> str:
return available_converters


# Initialize dynamo converter registry with the FX and Dynamo aten registries
# Note the Dynamo registry is listed first, for precedence
registries = [
DYNAMO_ATEN_CONVERTERS,
# Initialize dynamo converter registry with Dynamo aten converters only
# FX converters are not loaded here - they are legacy and should only be used
# in the FX frontend, not as fallbacks in the dynamo frontend
registries: List[
Dict[Target, Union[Callable[..., Any], Sequence[ConverterSupport]]]
] = [
DYNAMO_ATEN_CONVERTERS, # type: ignore[list-item]
]
registry_names = ["Dynamo ATen Converters Registry"]
registry_calling_conventions = [
CallingConvention.CTX,
]
if torch_tensorrt.ENABLED_FEATURES.fx_frontend:
from torch_tensorrt.fx.converter_registry import CONVERTERS as FX_CONVERTERS

registries.append(FX_CONVERTERS)
registry_names.append("FX Legacy ATen Converters Registry")
registry_calling_conventions.append(CallingConvention.LEGACY)


DYNAMO_CONVERTERS: ConverterRegistry = ConverterRegistry(
registries,
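To illustrate the narrowed registry (the import path follows this file's location; the chosen target and the expected count are assumptions, not asserted by the PR):

import torch
from torch_tensorrt.dynamo.conversion._ConverterRegistry import DYNAMO_CONVERTERS

# With the FX fallback registry no longer appended, lookups consult only the
# Dynamo ATen registry; per the updated annotation, the call returns the
# converter list plus an optional per-registry count.
converters, registry_data = DYNAMO_CONVERTERS.get_all_converters_with_target(
    torch.ops.aten.add.Tensor, return_registry_info=True
)
print(registry_data)  # expected to mention only "Dynamo ATen Converters Registry"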