
Commit 605bba0

bug fixes and support cpp runtime
1 parent e7a0faf commit 605bba0

File tree

4 files changed: +133 -58 lines changed

core/runtime/TRTEngine.cpp (+21 -2)

@@ -30,6 +30,27 @@ std::vector<std::string> split(const std::string& str, char delim) {
   return strings;
 }
 
+DynamicOutputAllocator::DynamicOutputAllocator(const std::unordered_map<std::string, at::ScalarType>& output_dtypes)
+    : dtypes(output_dtypes) {}
+
+void* DynamicOutputAllocator::reallocateOutputAsync(
+    char const* tensorName,
+    void* currentMemory,
+    uint64_t size,
+    uint64_t alignment,
+    cudaStream_t stream) {
+  std::vector<int64_t> shape = {static_cast<int64_t>(size)};
+  auto it = buffers.find(tensorName);
+  if (it == buffers.end() || it->second.sizes() != shape) {
+    buffers[tensorName] = at::empty(shape, at::TensorOptions().dtype(dtypes.at(tensorName)).device(c10::kCUDA));
+  }
+  return buffers[tensorName].data_ptr();
+}
+
+void DynamicOutputAllocator::notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept {
+  shapes[tensorName] = dims;
+}
+
 TRTEngine::TRTEngine(
     const std::string& serialized_engine,
     const RTDevice& cuda_device,
@@ -137,7 +158,6 @@ TRTEngine::TRTEngine(
   in_binding_names.resize(inputs);
   input_buffers.resize(inputs);
   out_binding_names.resize(outputs);
-  output_buffers.resize(outputs);
   for (int64_t x = 0; x < cuda_engine->getNbIOTensors(); x++) {
     std::string bind_name = cuda_engine->getIOTensorName(x);
     if (cuda_engine->getTensorIOMode(bind_name.c_str()) == nvinfer1::TensorIOMode::kINPUT) {
@@ -179,7 +199,6 @@ TRTEngine::TRTEngine(
 
   uint64_t outputs = _out_binding_names.size();
   out_binding_names.resize(outputs);
-  output_buffers.resize(outputs);
   for (size_t pyt_idx = 0; pyt_idx < outputs; pyt_idx++) {
     auto binding_name = _out_binding_names[pyt_idx];
     // Check if the binding name provided is in the list of engine's bindings
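A note on the allocation above: in the nvinfer1::IOutputAllocator contract, the size passed to reallocateOutputAsync is a byte count, while the buffer here is created with size *elements* of the output dtype. For any dtype wider than one byte this over-allocates, which is safe; the read-back path in execute_engine.cpp below trims the buffer to the element count implied by the dims recorded in notifyShape. A minimal sketch of that arithmetic, with hypothetical numbers (not the commit's code):

// Hypothetical numbers illustrating the byte-count vs element-count bookkeeping.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t size = 1024; // bytes TensorRT requests for one output tensor
  int64_t itemsize = 4; // e.g. float32
  int64_t allocated_elems = static_cast<int64_t>(size); // elements the allocator creates
  // notifyShape() later reports the real dims, e.g. {64, 3} -> 192 elements.
  int64_t prod = 64 * 3;
  // The payload always fits: TensorRT asks for at least prod * itemsize bytes,
  // and allocated_elems * itemsize >= size whenever itemsize >= 1.
  assert(prod * itemsize <= static_cast<int64_t>(size));
  assert(static_cast<int64_t>(size) <= allocated_elems * itemsize);
  return 0;
}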

core/runtime/TRTEngine.h (+28 -1)

@@ -69,11 +69,39 @@ struct TorchTRTRuntimeStates {
   }
 };
 
+class DynamicOutputAllocator : public nvinfer1::IOutputAllocator {
+ public:
+  DynamicOutputAllocator(const std::unordered_map<std::string, at::ScalarType>& output_dtypes);
+
+  void* reallocateOutputAsync(
+      char const* tensorName,
+      void* currentMemory,
+      uint64_t size,
+      uint64_t alignment,
+      cudaStream_t stream) override;
+
+  void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override;
+
+  const std::unordered_map<std::string, at::Tensor>& getBuffers() const {
+    return buffers;
+  }
+
+  const std::unordered_map<std::string, nvinfer1::Dims>& getShapes() const {
+    return shapes;
+  }
+
+ private:
+  std::unordered_map<std::string, at::ScalarType> dtypes;
+  std::unordered_map<std::string, at::Tensor> buffers;
+  std::unordered_map<std::string, nvinfer1::Dims> shapes;
+};
+
 struct TRTEngine : torch::CustomClassHolder {
   // Each engine needs it's own runtime object
   std::shared_ptr<nvinfer1::IRuntime> rt;
   std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
   std::shared_ptr<nvinfer1::IExecutionContext> exec_ctx;
+  std::shared_ptr<DynamicOutputAllocator> output_allocator;
   std::pair<uint64_t, uint64_t> num_io;
   std::string name;
   RTDevice device_info;
@@ -141,7 +169,6 @@ struct TRTEngine : torch::CustomClassHolder {
   at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
   at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
   std::vector<at::Tensor> input_buffers = {};
-  std::vector<at::Tensor> output_buffers = {};
   std::string shape_key = "None";
   bool use_pre_allocated_outputs = false;
   std::vector<at::Tensor> pre_allocated_outputs;
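For context, a minimal usage sketch of this class (hypothetical, not the commit's code): the allocator is registered per output binding before enqueueV3, TensorRT calls reallocateOutputAsync/notifyShape during execution, and the recorded buffers and dims are read back afterwards. The binding name "output0", its float32 dtype, and the pre-bound inputs are assumptions here.

// Hypothetical wiring sketch; assumes `ctx` already has its inputs bound and
// that "output0" is a data-dependent output binding of type float32.
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include <ATen/ATen.h>
#include <NvInfer.h>
#include <cuda_runtime_api.h>

at::Tensor run_once(nvinfer1::IExecutionContext& ctx, cudaStream_t stream) {
  std::unordered_map<std::string, at::ScalarType> dtypes = {{"output0", at::kFloat}};
  DynamicOutputAllocator allocator(dtypes);

  // Register the allocator; TensorRT calls back into it during enqueueV3().
  if (!ctx.setOutputAllocator("output0", &allocator)) {
    throw std::runtime_error("Failed to set output allocator for output0");
  }
  if (!ctx.enqueueV3(stream)) {
    throw std::runtime_error("enqueueV3 failed");
  }
  cudaStreamSynchronize(stream);

  // notifyShape() has recorded the real dims; the buffer may be oversized.
  nvinfer1::Dims dims = allocator.getShapes().at("output0");
  int64_t numel = 1;
  std::vector<int64_t> out_shape(dims.nbDims);
  for (int32_t i = 0; i < dims.nbDims; ++i) {
    numel *= dims.d[i];
    out_shape[i] = dims.d[i];
  }
  // Clone so the result survives buffer reuse on a subsequent run.
  return allocator.getBuffers().at("output0").reshape(-1).slice(0, 0, numel).reshape(out_shape).clone();
}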

core/runtime/execute_engine.cpp (+41 -54)

@@ -163,22 +163,23 @@ void setup_input_tensors(
     }
   }
 }
-std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
-  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
-  for (auto output_indices : compiled_engine->out_binding_map) {
-    // out_binding_map stores TRT_IDX: PYT_IDX
-    auto pyt_idx = output_indices.second;
-
-    std::string name = compiled_engine->out_binding_names[pyt_idx];
-    auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
-    LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
-
-    auto dims = core::util::toVec(out_shape);
-    auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-    outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
+
+void setup_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  if (compiled_engine->output_allocator == nullptr) {
+    std::unordered_map<std::string, at::ScalarType> output_dtypes_dict;
+    for (size_t o = 0; o < compiled_engine->out_binding_names.size(); ++o) {
+      auto name = compiled_engine->out_binding_names[o];
+      output_dtypes_dict[name] =
+          util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    }
+    compiled_engine->output_allocator = std::make_shared<DynamicOutputAllocator>(output_dtypes_dict);
   }
 
-  return outputs;
+  for (const auto& output_name : compiled_engine->out_binding_names) {
+    if (!compiled_engine->exec_ctx->setOutputAllocator(output_name.c_str(), compiled_engine->output_allocator.get())) {
+      throw std::runtime_error("Failed to set output allocator for " + output_name);
+    }
+  }
 }
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
@@ -218,7 +219,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   }
 
   // Intialize inputs and outputs to be available throughout the succeeding scopes
-  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
 
   if (MULTI_DEVICE_SAFE_MODE) {
     std::unique_ptr<torch::autograd::profiler::RecordProfile> device_profiler_guard;
@@ -287,44 +287,20 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
         << " cannot be inferred. This could happen if the input tensor addresses/shapes haven't been configured correctly");
   }
 
-  { // Output Setup
-    std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+  { // OutputAllocator Setup
+    std::unique_ptr<torch::autograd::profiler::RecordProfile> output_allocator_profiler_guard;
     if (compiled_engine->profile_execution) {
-      output_profiler_guard =
+      output_allocator_profiler_guard =
          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
     }
-    if (can_use_pre_allocated_outputs) {
-      outputs = compiled_engine->pre_allocated_outputs;
-    } else {
-      outputs = create_output_tensors(compiled_engine);
-    }
-
-    for (auto output_indices : compiled_engine->out_binding_map) {
-      auto pyt_idx = output_indices.second;
-      std::string name = compiled_engine->out_binding_names[pyt_idx];
-      if (need_cudagraphs_record) {
-        // If we are recording the cuda graph then we need to update the persistent output buffer
-        compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-      }
-
-      if (cudagraphs_enabled) {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(
-                name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
-      } else {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
-      }
-    }
+    setup_output_allocator(compiled_engine);
  }
 
   auto current_device_id = -1;
   if (inputs.size() > 0) {
     current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-  } else if (outputs.size() > 0) {
-    current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
+  } else {
+    current_device_id = c10::cuda::current_device();
   }
 
   compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
@@ -368,21 +344,32 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     }
   } // End engine exeuction (resets to caller stream)
 
-  // Create output buffer for next execution of graph or trt context.
-  if (compiled_engine->use_pre_allocated_outputs) {
-    compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
-  }
-
   // Block caller stream until engine execution is complete
   at::cuda::CUDAEvent trt_exec_complete;
   trt_exec_complete.record(compiled_engine->engine_stream);
   trt_exec_complete.block(compiled_engine->caller_stream);
 
-  if (cudagraphs_enabled) {
-    // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
-    for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
-      outputs[o].copy_(compiled_engine->output_buffers[o], false);
+  std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+  if (compiled_engine->profile_execution) {
+    output_profiler_guard =
+        std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+  }
+  std::vector<at::Tensor> outputs;
+  for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
+    auto name = compiled_engine->out_binding_names[i];
+    auto dims = compiled_engine->output_allocator->getShapes().at(name);
+    auto dtype = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    at::Tensor output = compiled_engine->output_allocator->getBuffers().at(name).clone().detach();
+    int64_t prod = 1;
+    for (int i = 0; i < dims.nbDims; ++i) {
+      prod *= dims.d[i];
+    }
+    std::vector<int64_t> dims_vec(dims.nbDims);
+    for (int i = 0; i < dims.nbDims; ++i) {
+      dims_vec[i] = dims.d[i];
    }
+    output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(dims_vec);
+    outputs.push_back(output);
  }
 
   if (compiled_engine->profile_execution) {
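The read-back loop above is what makes data-dependent output shapes (e.g. aten.nonzero) work at runtime: the allocator's buffer was sized from a worst-case byte count, so the real result is the first prod(dims) elements, restored to the shape notifyShape() recorded. The commit's .view(dtype) is effectively an identity here, since the buffer is already allocated with the output dtype, so the sketch below omits it. A condensed version of just the trimming step (the standalone helper is illustrative, not part of the commit):

// Illustrative helper mirroring the trimming loop above: keep the first
// prod(dims) elements of an oversized, correctly-typed buffer and restore
// the shape TensorRT reported via notifyShape().
#include <vector>

#include <ATen/ATen.h>
#include <NvInfer.h>

at::Tensor trim_to_reported_shape(const at::Tensor& buffer, const nvinfer1::Dims& dims) {
  int64_t prod = 1;
  std::vector<int64_t> dims_vec(dims.nbDims);
  for (int32_t i = 0; i < dims.nbDims; ++i) {
    prod *= dims.d[i];
    dims_vec[i] = dims.d[i];
  }
  // Flatten, drop the over-allocated tail, then restore the real shape.
  return buffer.reshape(-1).slice(/*dim=*/0, /*start=*/0, /*end=*/prod).reshape(dims_vec);
}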

tests/py/dynamo/conversion/test_nonzero_aten.py (+43 -1)

@@ -2,6 +2,7 @@
 import torch.nn as nn
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt import Input
 
 from .harness import DispatchTestCase
 
@@ -16,7 +17,7 @@ class TestNonZeroConverter(DispatchTestCase):
             ((2, 3, 4, 5), torch.float),
         ]
     )
-    def test_non_zero_float(self, input_shape, dtype):
+    def test_non_zero(self, input_shape, dtype):
         class NonZero(nn.Module):
             def forward(self, input):
                 return torch.ops.aten.nonzero.default(input)
@@ -27,6 +28,47 @@ def forward(self, input):
             inputs,
         )
 
+    @parameterized.expand(
+        [
+            (
+                "1d",
+                (1,),
+                (10,),
+                (100,),
+                torch.int32,
+            ),
+            (
+                "2d",
+                (1, 2),
+                (5, 10),
+                (20, 40),
+                torch.float16,
+            ),
+            (
+                "3d",
+                (1, 2, 3),
+                (5, 10, 20),
+                (30, 40, 50),
+                torch.float,
+            ),
+        ]
+    )
+    def test_nonzero_dynamic_shape(self, _, min_shape, opt_shape, max_shape, dtype):
+        class NonZero(nn.Module):
+            def forward(self, input):
+                return torch.ops.aten.nonzero.default(input)
+
+        input_specs = [
+            Input(
+                min_shape=min_shape,
+                opt_shape=opt_shape,
+                max_shape=max_shape,
+                dtype=dtype,
+            ),
+        ]
+
+        self.run_test_with_dynamic_shape(NonZero(), input_specs)
+
 
 if __name__ == "__main__":
     run_tests()
