
Commit 2a09f27

ankan-ban, gedoensmax, gaugarg-nv, iraut, and hrishikeshm authored
NV TensorRT RTX EP - initial commit (#24456)
New EP, currently based on the existing TensorRT EP but meant to be used on RTX GPUs with a lean version of TensorRT.

### Description

Adds a new EP based on the TensorRT EP. It uses a special version of TensorRT optimized for RTX GPUs. In the future we plan to streamline the EP further (e.g., remove the dependency on the CUDA EP entirely).

### Motivation and Context

The new TensorRT for RTX offers:

1. A much smaller footprint.
2. Much faster model compile/load times.
3. Better usability: cached models can be reused across multiple RTX GPUs.

This effort also targets WCR ML workflows.

---------

Co-authored-by: Maximilian Müller <[email protected]>
Co-authored-by: Gaurav Garg <[email protected]>
Co-authored-by: iraut <[email protected]>
Co-authored-by: Hrishikesh Manohar <[email protected]>
Co-authored-by: Maximilian Müller <[email protected]>
1 parent 6c8cb6a commit 2a09f27


48 files changed: +6,615 −26 lines

cmake/CMakeLists.txt (+11)

```diff
@@ -107,6 +107,7 @@ option(onnxruntime_ENABLE_MICROSOFT_INTERNAL "Use this option to enable/disable
 option(onnxruntime_USE_VITISAI "Build with Vitis-AI" OFF)
 option(onnxruntime_USE_TENSORRT "Build with TensorRT support" OFF)
 option(onnxruntime_USE_TENSORRT_BUILTIN_PARSER "Use TensorRT builtin parser" OFF)
+option(onnxruntime_USE_NV "Build with TensorRT support" OFF)
 option(onnxruntime_ENABLE_LTO "Enable link time optimization" OFF)
 option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF)
 option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF)
@@ -250,6 +251,7 @@ option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for thre
 option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)

 option(onnxruntime_USE_TENSORRT_INTERFACE "Build ONNXRuntime shared lib which is compatible with TensorRT EP interface" OFF)
+option(onnxruntime_USE_NV_INTERFACE "Build ONNXRuntime shared lib which is compatible with NV EP interface" OFF)
 option(onnxruntime_USE_CUDA_INTERFACE "Build ONNXRuntime shared lib which is compatible with Cuda EP interface" OFF)
 option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is compatible with OpenVINO EP interface" OFF)
 option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
@@ -946,6 +948,15 @@ if (onnxruntime_USE_TENSORRT_INTERFACE AND (NOT onnxruntime_USE_TENSORRT))
   list(APPEND ORT_INTERFACE_FLAGS -DUSE_TENSORRT=1)
 endif()

+if (onnxruntime_USE_NV)
+  list(APPEND ORT_PROVIDER_FLAGS -DUSE_NV=1)
+  list(APPEND ONNXRUNTIME_PROVIDER_NAMES nv_tensorrt_rtx)
+endif()
+
+if (onnxruntime_USE_NV_INTERFACE AND (NOT onnxruntime_USE_NV))
+  list(APPEND ORT_INTERFACE_FLAGS -DUSE_NV=1)
+endif()
+
 if (onnxruntime_USE_RKNPU)
   list(APPEND ORT_PROVIDER_FLAGS -DUSE_RKNPU=1)
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES rknpu)
```

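The two guards added at the end of this hunk interact: the interface define is appended only when the EP interface is requested (`-Donnxruntime_USE_NV_INTERFACE=ON`) without the EP itself being built. A minimal standalone sketch of that interplay, runnable with `cmake -P`; the two `set()` calls stand in for the `-D...=ON` cache options a real build would pass:

```cmake
# Sketch only: mimics the CMakeLists.txt guards above outside the real build.
set(onnxruntime_USE_NV ON)            # stand-in for -Donnxruntime_USE_NV=ON
set(onnxruntime_USE_NV_INTERFACE ON)  # stand-in for -Donnxruntime_USE_NV_INTERFACE=ON

if (onnxruntime_USE_NV)
  list(APPEND ORT_PROVIDER_FLAGS -DUSE_NV=1)
  list(APPEND ONNXRUNTIME_PROVIDER_NAMES nv_tensorrt_rtx)
endif()

# The interface define is appended only when the EP itself is not being built,
# so USE_NV=1 never ends up in both flag lists at once.
if (onnxruntime_USE_NV_INTERFACE AND (NOT onnxruntime_USE_NV))
  list(APPEND ORT_INTERFACE_FLAGS -DUSE_NV=1)
endif()

message(STATUS "provider flags:  ${ORT_PROVIDER_FLAGS}")   # -DUSE_NV=1
message(STATUS "interface flags: ${ORT_INTERFACE_FLAGS}")  # empty in this configuration
```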
cmake/onnxruntime_framework.cmake (+1 −1)

```diff
@@ -63,7 +63,7 @@ endif()
 if(onnxruntime_ENABLE_INSTRUMENT)
   target_compile_definitions(onnxruntime_framework PRIVATE ONNXRUNTIME_ENABLE_INSTRUMENT)
 endif()
-if(onnxruntime_USE_TENSORRT OR onnxruntime_USE_NCCL)
+if(onnxruntime_USE_TENSORRT OR onnxruntime_USE_NCCL OR onnxruntime_USE_NV)
   # TODO: for now, core framework depends on CUDA. It should be moved to TensorRT EP
   # TODO: provider_bridge_ort.cc should not include nccl.h
   target_include_directories(onnxruntime_framework PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
```
cmake/onnxruntime_providers.cmake (+4)

```diff
@@ -132,6 +132,10 @@ if (onnxruntime_USE_TENSORRT)
   include(onnxruntime_providers_tensorrt.cmake)
 endif()

+if (onnxruntime_USE_NV)
+  include(onnxruntime_providers_nv.cmake)
+endif()
+
 if (onnxruntime_USE_VITISAI)
   include(onnxruntime_providers_vitisai.cmake)
 endif()
```

cmake/onnxruntime_providers_nv.cmake (new file, +202)

```cmake
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
find_package(CUDAToolkit REQUIRED 12.8)
enable_language(CUDA)
if(onnxruntime_DISABLE_CONTRIB_OPS)
  message( FATAL_ERROR "To compile TensorRT execution provider contrib ops have to be enabled to dump an engine using com.microsoft:EPContext node." )
endif()
add_definitions(-DUSE_NV=1)
if (onnxruntime_NV_PLACEHOLDER_BUILDER)
  add_definitions(-DORT_NV_PLACEHOLDER_BUILDER)
endif()
set(BUILD_LIBRARY_ONLY 1)
add_definitions("-DONNX_ML=1")
add_definitions("-DONNX_NAMESPACE=onnx")
set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})
set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME})
set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(PROTOBUF_LIBRARY ${PROTOBUF_LIB})
if (WIN32)
  set(OLD_CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS})
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4099 /wd4551 /wd4505 /wd4515 /wd4706 /wd4456 /wd4324 /wd4701 /wd4804 /wd4702 /wd4458 /wd4703")
  if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4805")
  endif()
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -include algorithm")
  set(DISABLED_WARNINGS_FOR_TRT /wd4456)
endif()
if ( CMAKE_COMPILER_IS_GNUCC )
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter -Wno-missing-field-initializers")
endif()
set(CXX_VERSION_DEFINED TRUE)

find_path(TENSORRT_INCLUDE_DIR NvInfer.h
  HINTS ${TENSORRT_ROOT}
  PATH_SUFFIXES include)


file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h NVINFER_VER_CONTENT)
string(REGEX MATCH "define NV_TENSORRT_MAJOR * +([0-9]+)" NV_TENSORRT_MAJOR "${NVINFER_VER_CONTENT}")
string(REGEX REPLACE "define NV_TENSORRT_MAJOR * +([0-9]+)" "\\1" NV_TENSORRT_MAJOR "${NV_TENSORRT_MAJOR}")
string(REGEX MATCH "define NV_TENSORRT_MINOR * +([0-9]+)" NV_TENSORRT_MINOR "${NVINFER_VER_CONTENT}")
string(REGEX REPLACE "define NV_TENSORRT_MINOR * +([0-9]+)" "\\1" NV_TENSORRT_MINOR "${NV_TENSORRT_MINOR}")
string(REGEX MATCH "define NV_TENSORRT_PATCH * +([0-9]+)" NV_TENSORRT_PATCH "${NVINFER_VER_CONTENT}")
string(REGEX REPLACE "define NV_TENSORRT_PATCH * +([0-9]+)" "\\1" NV_TENSORRT_PATCH "${NV_TENSORRT_PATCH}")
math(EXPR NV_TENSORRT_MAJOR_INT "${NV_TENSORRT_MAJOR}")
math(EXPR NV_TENSORRT_MINOR_INT "${NV_TENSORRT_MINOR}")
math(EXPR NV_TENSORRT_PATCH_INT "${NV_TENSORRT_PATCH}")

if (NV_TENSORRT_MAJOR)
  MESSAGE(STATUS "NV_TENSORRT_MAJOR is ${NV_TENSORRT_MAJOR}")
else()
  MESSAGE(STATUS "Can't find NV_TENSORRT_MAJOR macro")
endif()

# Check TRT version >= 10.0.1.6
if ((NV_TENSORRT_MAJOR_INT GREATER 10) OR
    (NV_TENSORRT_MAJOR_INT EQUAL 10 AND NV_TENSORRT_MINOR_INT GREATER 0) OR
    (NV_TENSORRT_MAJOR_INT EQUAL 10 AND NV_TENSORRT_PATCH_INT GREATER 0))
  set(TRT_GREATER_OR_EQUAL_TRT_10_GA ON)
else()
  message( FATAL_ERROR "Only TensorRT 10.x or higher is supported." )
endif()

# TensorRT 10 GA onwards, the TensorRT libraries will have major version appended to the end on Windows,
# for example, nvinfer_10.dll, nvonnxparser_10.dll ...
if (WIN32 AND TRT_GREATER_OR_EQUAL_TRT_10_GA)
  set(NVINFER_LIB "nvinfer_${NV_TENSORRT_MAJOR}")
  set(PARSER_LIB "nvonnxparser_${NV_TENSORRT_MAJOR}")
endif()

if (NOT NVINFER_LIB)
  set(NVINFER_LIB "nvinfer")
endif()

if (NOT PARSER_LIB)
  set(PARSER_LIB "nvonnxparser")
endif()

MESSAGE(STATUS "Looking for ${NVINFER_LIB}")

find_library(TENSORRT_LIBRARY_INFER ${NVINFER_LIB}
  HINTS ${TENSORRT_ROOT}
  PATH_SUFFIXES lib lib64 lib/x64)

if (NOT TENSORRT_LIBRARY_INFER)
  MESSAGE(STATUS "Can't find ${NVINFER_LIB}")
endif()

if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
  MESSAGE(STATUS "Looking for ${PARSER_LIB}")

  find_library(TENSORRT_LIBRARY_NVONNXPARSER ${PARSER_LIB}
    HINTS ${TENSORRT_ROOT}
    PATH_SUFFIXES lib lib64 lib/x64)

  if (NOT TENSORRT_LIBRARY_NVONNXPARSER)
    MESSAGE(STATUS "Can't find ${PARSER_LIB}")
  endif()

  set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_NVONNXPARSER})
  MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
else()
  if (TRT_GREATER_OR_EQUAL_TRT_10_GA)
    set(ONNX_USE_LITE_PROTO ON)
  endif()
  onnxruntime_fetchcontent_declare(
    onnx_tensorrt
    URL ${DEP_URL_onnx_tensorrt}
    URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt}
    EXCLUDE_FROM_ALL
  )
  if (NOT CUDA_INCLUDE_DIR)
    set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS}) # onnx-tensorrt repo needs this variable to build
  endif()
  # The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses
  # unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose.
  onnxruntime_fetchcontent_makeavailable(onnx_tensorrt)
  include_directories(${onnx_tensorrt_SOURCE_DIR})
  set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
  if ( CMAKE_COMPILER_IS_GNUCC )
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
  endif()
  if (WIN32)
    set(CMAKE_CUDA_FLAGS ${OLD_CMAKE_CUDA_FLAGS})
    unset(PROTOBUF_LIBRARY)
    unset(OLD_CMAKE_CXX_FLAGS)
    unset(OLD_CMAKE_CUDA_FLAGS)
    set_target_properties(${PARSER_LIB} PROPERTIES LINK_FLAGS "/ignore:4199")
    target_compile_options(nvonnxparser_static PRIVATE /FIio.h /wd4100)
    target_compile_options(${PARSER_LIB} PRIVATE /FIio.h /wd4100)
  endif()
  # Static libraries are just nvonnxparser_static on all platforms
  set(onnxparser_link_libs nvonnxparser_static)
  set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER})
  MESSAGE(STATUS "Find TensorRT libs at ${TENSORRT_LIBRARY}")
endif()

include_directories(${TENSORRT_INCLUDE_DIR})
# ${TENSORRT_LIBRARY} is empty if we link nvonnxparser_static.
# nvonnxparser_static is linked against tensorrt libraries in onnx-tensorrt
# See https://github.com/onnx/onnx-tensorrt/blob/8af13d1b106f58df1e98945a5e7c851ddb5f0791/CMakeLists.txt#L121
# However, starting from TRT 10 GA, nvonnxparser_static doesn't link against tensorrt libraries.
# Therefore, the above code finds ${TENSORRT_LIBRARY_INFER}
set(trt_link_libs ${CMAKE_DL_LIBS} ${TENSORRT_LIBRARY})
file(GLOB_RECURSE onnxruntime_providers_nv_tensorrt_rtx_cc_srcs CONFIGURE_DEPENDS
  "${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/*.h"
  "${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/*.cc"
  "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
  "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
  "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_stream_handle.h"
  "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_stream_handle.cc"
  "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_graph.h"
  "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_graph.cc"
)

source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_nv_tensorrt_rtx_cc_srcs})
onnxruntime_add_shared_library_module(onnxruntime_providers_nv_tensorrt_rtx ${onnxruntime_providers_nv_tensorrt_rtx_cc_srcs})
onnxruntime_add_include_to_target(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_common)
target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE Eigen3::Eigen onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface Eigen3::Eigen)
add_dependencies(onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER)
  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
else()
  target_link_libraries(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart)
endif()
target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR}
  PUBLIC ${CUDAToolkit_INCLUDE_DIRS})

# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
set_target_properties(onnxruntime_providers_nv_tensorrt_rtx PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(onnxruntime_providers_nv_tensorrt_rtx PROPERTIES FOLDER "ONNXRuntime")
target_compile_definitions(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ONNXIFI_BUILD_LIBRARY=1)
target_compile_options(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${DISABLED_WARNINGS_FOR_TRT})
if (WIN32)
  target_compile_options(onnxruntime_providers_nv_tensorrt_rtx INTERFACE /wd4456)
endif()
# set CUDA_MINIMAL as default for NV provider since we do not have fallback to CUDA
target_compile_definitions(onnxruntime_providers_nv_tensorrt_rtx PRIVATE USE_CUDA_MINIMAL=1)

# Needed for the provider interface, as it includes training headers when training is enabled
if (onnxruntime_ENABLE_TRAINING_OPS)
  target_include_directories(onnxruntime_providers_nv_tensorrt_rtx PRIVATE ${ORTTRAINING_ROOT})
  if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
    onnxruntime_add_include_to_target(onnxruntime_providers_nv_tensorrt_rtx Python::Module)
  endif()
endif()

if(APPLE)
  set_property(TARGET onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/exported_symbols.lst")
elseif(UNIX)
  set_property(TARGET onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations")
  set_property(TARGET onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/version_script.lds -Xlinker --gc-sections")
elseif(WIN32)
  set_property(TARGET onnxruntime_providers_nv_tensorrt_rtx APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/symbols.def")
else()
  message(FATAL_ERROR "onnxruntime_providers_nv_tensorrt_rtx unknown platform, need to specify shared library exports for it")
endif()

install(TARGETS onnxruntime_providers_nv_tensorrt_rtx
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
```

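The version detection in this file works by reading NvInferVersion.h into a string and pulling each macro out with a `string(REGEX MATCH ...)` / `string(REGEX REPLACE ...)` pair: the MATCH isolates the whole `define NV_TENSORRT_MAJOR <digits>` fragment, and the REPLACE collapses that fragment down to its captured digits. A self-contained sketch of the technique, runnable with `cmake -P`; the header text below is a hypothetical sample, not copied from a real TensorRT install:

```cmake
# Hypothetical NvInferVersion.h excerpt; real installs define these macros similarly.
set(NVINFER_VER_CONTENT "#define NV_TENSORRT_MAJOR 10\n#define NV_TENSORRT_MINOR 0\n#define NV_TENSORRT_PATCH 1")

# Step 1: isolate the `define NV_TENSORRT_MAJOR <digits>` fragment from the header text.
string(REGEX MATCH "define NV_TENSORRT_MAJOR * +([0-9]+)" NV_TENSORRT_MAJOR "${NVINFER_VER_CONTENT}")
# Step 2: replace the fragment with its captured digits, leaving just the number.
string(REGEX REPLACE "define NV_TENSORRT_MAJOR * +([0-9]+)" "\\1" NV_TENSORRT_MAJOR "${NV_TENSORRT_MAJOR}")

math(EXPR NV_TENSORRT_MAJOR_INT "${NV_TENSORRT_MAJOR}")  # normalize the string to an integer
message(STATUS "parsed major version: ${NV_TENSORRT_MAJOR_INT}")  # prints 10
```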
cmake/onnxruntime_python.cmake (+10)

```diff
@@ -928,6 +928,16 @@ if (onnxruntime_USE_TENSORRT)
   )
 endif()

+if (onnxruntime_USE_NV)
+  add_custom_command(
+    TARGET onnxruntime_pybind11_state POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy
+      $<TARGET_FILE:onnxruntime_providers_nv_tensorrt_rtx>
+      $<TARGET_FILE:onnxruntime_providers_shared>
+      $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
+  )
+endif()
+
 if (onnxruntime_USE_MIGRAPHX)
   add_custom_command(
     TARGET onnxruntime_pybind11_state POST_BUILD
```

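This copy step leans on generator expressions, so the source paths resolve to the actual built artifacts whatever the configuration and platform suffix. A minimal sketch of the same pattern; `my_provider` and `my_pybind_module` are placeholder targets for illustration, not targets from this commit:

```cmake
# Sketch: after my_pybind_module builds, drop my_provider's shared library next to it.
add_custom_command(
  TARGET my_pybind_module POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E copy
    $<TARGET_FILE:my_provider>            # resolves to the built .so/.dll at generate time
    $<TARGET_FILE_DIR:my_pybind_module>/  # directory of the module being packaged
)
```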