# SPDX-License-Identifier: Apache-2.0
from __future__ import print_function
from .tensorrt_engine import Engine
import tensorrt as trt
from onnx.backend.base import Backend, BackendRep, Device, DeviceType, namedtupledict
import onnx
from onnx import helper as onnx_helper
from onnx import numpy_helper
import numpy as np
import six
# HACK Should look for a better way/place to do this
from ctypes import cdll, c_char_p

libcudart = cdll.LoadLibrary('libcudart.so')
libcudart.cudaGetErrorString.restype = c_char_p


def cudaSetDevice(device_idx):
    ret = libcudart.cudaSetDevice(device_idx)
    if ret != 0:
        error_string = libcudart.cudaGetErrorString(ret)
        if isinstance(error_string, bytes):
            error_string = error_string.decode("utf-8")
        raise RuntimeError("cudaSetDevice: " + error_string)


def count_trailing_ones(vals):
    count = 0
    for val in reversed(vals):
        if val != 1:
            return count
        count += 1
    return count
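

# run() below calls _tensorrt_version(), which this file never defines; this
# is a minimal reconstruction, assuming trt.__version__ is the usual dotted
# "major.minor.patch" string.
def _tensorrt_version():
    return tuple(int(n) for n in trt.__version__.split('.'))
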
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


class TensorRTBackendRep(BackendRep):
    def __init__(self, model, device, max_workspace_size=None,
                 serialize_engine=False, verbose=False,
                 explicit_batchsize=False, **kwargs):
        if not isinstance(device, Device):
            device = Device(device)
        self._set_device(device)
        self._logger = TRT_LOGGER
        self.builder = trt.Builder(self._logger)
        # int(True) == 1 matches the EXPLICIT_BATCH network creation flag (bit 0).
        self.network = self.builder.create_network(flags=int(explicit_batchsize))
        self.parser = trt.OnnxParser(self.network, self._logger)
        self.config = self.builder.create_builder_config()
        self.serialize_engine = serialize_engine
        self.verbose = verbose
        self.dynamic = False

        if self.verbose:
            print(f'\nRunning {model.graph.name}...')
            TRT_LOGGER.min_severity = trt.Logger.VERBOSE

        if not isinstance(model, six.string_types):
            model_str = model.SerializeToString()
        else:
            model_str = model

        if not trt.init_libnvinfer_plugins(TRT_LOGGER, ""):
            msg = "Failed to initialize TensorRT's plugin library."
            raise RuntimeError(msg)

        if not self.parser.parse(model_str):
            error = self.parser.get_error(0)
            msg = "While parsing node number %i:\n" % error.node()
            msg += ("%s:%i In function %s:\n[%i] %s" %
                    (error.file(), error.line(), error.func(),
                     error.code(), error.desc()))
            raise RuntimeError(msg)

        if max_workspace_size is None:
            max_workspace_size = 1 << 28
        self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, max_workspace_size)

        # Defer the engine build if any input is a shape tensor or has a
        # dynamic (-1) dimension; those need optimization profiles that can
        # only be derived from the actual inputs at run time.
        num_inputs = self.network.num_inputs
        for idx in range(num_inputs):
            inp_tensor = self.network.get_input(idx)
            if inp_tensor.is_shape_tensor or -1 in inp_tensor.shape:
                self.dynamic = True
                break

        if self.verbose:
            for layer in self.network:
                print(layer)
            print(f'Output shape: {self.network[-1].get_output(0).shape}')

        if self.dynamic:
            if self.verbose:
                print("Found dynamic inputs! Deferring engine build to run stage")
        else:
            self._build_engine()

        self._output_shapes = {}
        self._output_dtype = {}
        for output in model.graph.output:
            dims = output.type.tensor_type.shape.dim
            output_shape = tuple([dim.dim_value for dim in dims])
            self._output_shapes[output.name] = output_shape
            self._output_dtype[output.name] = output.type.tensor_type.elem_type

    def _build_engine(self, inputs=None):
        """
        Builds the TensorRT engine, registering optimization profiles when needed.

        :param inputs: inputs to the model; if not None, the engine is being
            built at run time, because optimization profiles must be
            registered for some inputs
        :type inputs: list of np.ndarray
        """
        if inputs:
            opt_profile = self.builder.create_optimization_profile()
            # Set optimization profiles for the input bindings that need them
            for i in range(self.network.num_inputs):
                inp_tensor = self.network.get_input(i)
                name = inp_tensor.name
                # Set profiles for shape tensors
                if inp_tensor.is_shape_tensor:
                    if inputs[i].ndim > 0:
                        val_list = inputs[i].tolist()
                        opt_profile.set_shape_input(name, val_list, val_list, val_list)
                    else:
                        opt_profile.set_shape_input(name, [inputs[i]], [inputs[i]], [inputs[i]])
                # Set profiles for dynamic execution tensors
                elif -1 in inp_tensor.shape:
                    opt_profile.set_shape(name, inputs[i].shape, inputs[i].shape, inputs[i].shape)
            self.config.add_optimization_profile(opt_profile)

        trt_blob = self.builder.build_serialized_network(self.network, self.config)
        if trt_blob is None:
            raise RuntimeError("Failed to build TensorRT engine from network")
        trt_engine = self._deserialize(trt_blob)
        self.engine = Engine(trt_engine)

    def _set_device(self, device):
        self.device = device
        assert device.type == DeviceType.CUDA
        cudaSetDevice(device.device_id)

    def _deserialize(self, trt_blob):
        self.runtime = trt.Runtime(TRT_LOGGER)
        del self.parser  # The parser no longer needs to own the plugins
        trt_engine = self.runtime.deserialize_cuda_engine(trt_blob)
        return trt_engine

    def run(self, inputs, **kwargs):
        """Execute the prepared engine and return the outputs as a named tuple.

        inputs -- Input tensor(s) as a Numpy array or list of Numpy arrays.
        """
        if isinstance(inputs, np.ndarray):
            inputs = [inputs]
        if self.dynamic:
            self._build_engine(inputs)
        outputs = self.engine.run(inputs)
        output_names = [output.name for output in self.engine.outputs]

        for i, (name, array) in enumerate(zip(output_names, outputs)):
            output_shape = self._output_shapes[name]
            # HACK WAR for unknown output shape in run_node
            if output_shape == (-99,):
                # WAR for TRT requiring at least 2 dims (NC)
                min_dims = 2
                if _tensorrt_version()[0] < 4:
                    # WAR for TRT only supporting 4D (NCHW) tensors
                    min_dims = 4
                if array.ndim == min_dims:
                    npadding_dims = count_trailing_ones(array.shape)
                    if npadding_dims > 0:
                        outputs[i] = array.reshape(array.shape[:-npadding_dims])
            else:
                # HACK WAR: cast outputs back to the dtypes declared in the
                # ONNX graph when TRT has narrowed them, but only if the
                # cast is lossless.
                if self._output_dtype[name] == onnx.TensorProto.INT64 and array.dtype == np.int32:
                    casted_output = np.array(outputs[i], dtype=np.int64)
                    if np.equal(outputs[i], casted_output).all():
                        outputs[i] = casted_output
                if self._output_dtype[name] == onnx.TensorProto.DOUBLE and array.dtype == np.float32:
                    casted_output = np.array(outputs[i], dtype=np.double)
                    if np.equal(outputs[i], casted_output).all():
                        outputs[i] = casted_output

        return namedtupledict('Outputs', output_names)(*outputs)


def np2onnx_dtype(np_dtype):
    if np_dtype == np.dtype('float32'):
        return onnx.TensorProto.FLOAT
    elif np_dtype == np.dtype('float16'):
        return onnx.TensorProto.FLOAT16
    elif np_dtype == np.dtype('int64'):
        return onnx.TensorProto.INT64
    elif np_dtype == np.dtype('int32'):
        return onnx.TensorProto.INT32
    elif np_dtype == np.dtype('int8'):
        return onnx.TensorProto.INT8
    elif np_dtype == np.dtype('double'):
        return onnx.TensorProto.DOUBLE
    else:
        raise TypeError("Unsupported data type: %s" % np_dtype)


def make_node_test_model(node, inputs, use_weights=True):
    # HACK TODO: The output info is unknown here; not sure what the best solution is
    output_dtype = np.float32  # Dummy value only
    output_shape = [-99]       # Dummy value only
    graph_inputs = [onnx_helper.make_tensor_value_info(
        name, np2onnx_dtype(array.dtype), array.shape)
        for name, array in zip(node.input, inputs)]
    graph_outputs = [onnx_helper.make_tensor_value_info(
        name, np2onnx_dtype(output_dtype), output_shape)
        for name in node.output]
    if use_weights:
        # Add initializers for all inputs except the first
        initializers = [onnx_helper.make_tensor(
            name, np2onnx_dtype(array.dtype), array.shape, array.flatten().tolist())
            for name, array in zip(node.input[1:], inputs[1:])]
    else:
        initializers = []
    graph = onnx_helper.make_graph(
        [node], "RunNodeGraph_" + node.op_type,
        graph_inputs, graph_outputs, initializer=initializers)
    model = onnx_helper.make_model(graph)
    return model


class TensorRTBackend(Backend):
    @classmethod
    def prepare(cls, model, device='CUDA:0', **kwargs):
        """Build an engine from the given model.

        model -- An ONNX model as a deserialized protobuf, or a string or
                 file-like object containing a serialized protobuf.
        """
        super(TensorRTBackend, cls).prepare(model, device, **kwargs)
        return TensorRTBackendRep(model, device, **kwargs)

    @classmethod
    def run_model(cls, model, inputs, device='CUDA:0', **kwargs):
        """Build and run an engine from the given model.

        model -- An ONNX model as a deserialized protobuf, or a string or
                 file-like object containing a serialized protobuf.
        inputs -- Input tensor(s) as a Numpy array or list of Numpy arrays.
        """
        return cls.prepare(model, device, **kwargs).run(inputs)

    @classmethod
    def run_node(cls, node, inputs, device='CUDA:0'):
        """Build and run an engine from the given node.

        node -- An ONNX node as a deserialized protobuf.

        Note: This function is intended for testing purposes only;
        use prepare() or run_model() for other purposes.
        """
        super(TensorRTBackend, cls).run_node(node, inputs, device)
        # HACK TODO: This is somewhat dodgy. We first try with weights for
        # all inputs but the first, then we try again with no weights if
        # the first try fails.
        model = make_node_test_model(node, inputs, use_weights=True)
        try:
            results = TensorRTBackend.prepare(model, device).run(inputs[:1])
        except RuntimeError:
            model = make_node_test_model(node, inputs, use_weights=False)
            results = TensorRTBackend.prepare(model, device).run(inputs)
        return results

    @classmethod
    def supports_device(cls, device_str):
        device = Device(device_str)
        return device.type == DeviceType.CUDA


# Module-level entry points expected by the onnx.backend API.
prepare = TensorRTBackend.prepare
run_node = TensorRTBackend.run_node
run_model = TensorRTBackend.run_model
supports_device = TensorRTBackend.supports_device
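

# A minimal usage sketch (not part of the original module): 'model.onnx' and
# the (1, 3, 224, 224) input shape below are hypothetical placeholders for
# whatever model and input your application actually uses.
if __name__ == '__main__':
    model = onnx.load('model.onnx')        # hypothetical model path
    rep = prepare(model, device='CUDA:0')  # builds the TensorRT engine
    dummy_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
    outputs = rep.run(dummy_input)         # returns a named tuple of outputs
    print([o.shape for o in outputs])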