@@ -163,22 +163,23 @@ void setup_input_tensors(
    }
  }
}
-std::vector<at::Tensor> create_output_tensors(c10::intrusive_ptr<TRTEngine> compiled_engine) {
-  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);
-  for (auto output_indices : compiled_engine->out_binding_map) {
-    // out_binding_map stores TRT_IDX: PYT_IDX
-    auto pyt_idx = output_indices.second;
-
-    std::string name = compiled_engine->out_binding_names[pyt_idx];
-    auto out_shape = compiled_engine->exec_ctx->getTensorShape(name.c_str());
-    LOG_DEBUG("Output Name: " << name << " Shape: " << out_shape);
-
-    auto dims = core::util::toVec(out_shape);
-    auto type = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
-    outputs[pyt_idx] = std::move(at::empty(dims, {at::kCUDA}).to(type).contiguous());
+
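+// Lazily builds this engine's DynamicOutputAllocator and registers it with the TensorRT
+// execution context, so TRT can request output memory once output shapes become known at
+// enqueue time rather than relying on pre-allocated, fixed-shape buffers.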
+void setup_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  if (compiled_engine->output_allocator == nullptr) {
+    std::unordered_map<std::string, at::ScalarType> output_dtypes_dict;
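+    // Record the torch dtype of every output binding so the allocator can type its buffers.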
+    for (size_t o = 0; o < compiled_engine->out_binding_names.size(); ++o) {
+      auto name = compiled_engine->out_binding_names[o];
+      output_dtypes_dict[name] =
+          util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    }
+    compiled_engine->output_allocator = std::make_shared<DynamicOutputAllocator>(output_dtypes_dict);
   }

-  return outputs;
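+  // Attach the allocator to every output binding on the execution context.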
+  for (const auto& output_name : compiled_engine->out_binding_names) {
+    if (!compiled_engine->exec_ctx->setOutputAllocator(output_name.c_str(), compiled_engine->output_allocator.get())) {
+      throw std::runtime_error("Failed to set output allocator for " + output_name);
+    }
+  }
 }

 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
@@ -218,7 +219,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
  }

  // Intialize inputs and outputs to be available throughout the succeeding scopes
-  std::vector<at::Tensor> outputs(compiled_engine->num_io.second);

  if (MULTI_DEVICE_SAFE_MODE) {
    std::unique_ptr<torch::autograd::profiler::RecordProfile> device_profiler_guard;
@@ -287,44 +287,20 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
        << " cannot be inferred. This could happen if the input tensor addresses/shapes haven't been configured correctly");
  }

-  { // Output Setup
-    std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+  { // OutputAllocator Setup
+    std::unique_ptr<torch::autograd::profiler::RecordProfile> output_allocator_profiler_guard;
    if (compiled_engine->profile_execution) {
-      output_profiler_guard =
+      output_allocator_profiler_guard =
          std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
    }
-    if (can_use_pre_allocated_outputs) {
-      outputs = compiled_engine->pre_allocated_outputs;
-    } else {
-      outputs = create_output_tensors(compiled_engine);
-    }
-
-    for (auto output_indices : compiled_engine->out_binding_map) {
-      auto pyt_idx = output_indices.second;
-      std::string name = compiled_engine->out_binding_names[pyt_idx];
-      if (need_cudagraphs_record) {
-        // If we are recording the cuda graph then we need to update the persistent output buffer
-        compiled_engine->output_buffers[pyt_idx] = std::move(outputs[pyt_idx].clone());
-      }
-
-      if (cudagraphs_enabled) {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(
-                name.c_str(), compiled_engine->output_buffers[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
-      } else {
-        TORCHTRT_CHECK(
-            compiled_engine->exec_ctx->setTensorAddress(name.c_str(), outputs[pyt_idx].data_ptr()),
-            "Error while setting the output tensor address");
-      }
-    }
+    setup_output_allocator(compiled_engine);
  }

  auto current_device_id = -1;
  if (inputs.size() > 0) {
    current_device_id = inputs[0].device().index(); // Done this way to avoid a call to cudart
-  } else if (outputs.size() > 0) {
-    current_device_id = outputs[0].device().index(); // Done this way to avoid a call to cudart
+  } else {
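+    // No input tensors to infer the device from, so fall back to the currently active CUDA device.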
+    current_device_id = c10::cuda::current_device();
  }

  compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(current_device_id);
@@ -368,21 +344,32 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
    }
  } // End engine exeuction (resets to caller stream)

-  // Create output buffer for next execution of graph or trt context.
-  if (compiled_engine->use_pre_allocated_outputs) {
-    compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
-  }
-
  // Block caller stream until engine execution is complete
  at::cuda::CUDAEvent trt_exec_complete;
  trt_exec_complete.record(compiled_engine->engine_stream);
  trt_exec_complete.block(compiled_engine->caller_stream);

-  if (cudagraphs_enabled) {
-    // If in CUDAGraph mode, results need to be copied to the result buffers (on caller stream)
-    for (size_t o = 0; o < compiled_engine->output_buffers.size(); o++) {
-      outputs[o].copy_(compiled_engine->output_buffers[o], false);
+  std::unique_ptr<torch::autograd::profiler::RecordProfile> output_profiler_guard;
+  if (compiled_engine->profile_execution) {
+    output_profiler_guard =
+        std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->output_profile_path);
+  }
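+  // Gather the engine outputs from the DynamicOutputAllocator, using the shapes TensorRT reported.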
+  std::vector<at::Tensor> outputs;
+  for (size_t i = 0; i < compiled_engine->out_binding_names.size(); i++) {
+    auto name = compiled_engine->out_binding_names[i];
+    auto dims = compiled_engine->output_allocator->getShapes().at(name);
+    auto dtype = util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    at::Tensor output = compiled_engine->output_allocator->getBuffers().at(name).clone().detach();
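+    // The allocator's backing buffer may be larger than the actual output: flatten it,
+    // reinterpret it as the output dtype, trim to the reported element count, then reshape.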
+    int64_t prod = 1;
+    for (int i = 0; i < dims.nbDims; ++i) {
+      prod *= dims.d[i];
+    }
+    std::vector<int64_t> dims_vec(dims.nbDims);
+    for (int i = 0; i < dims.nbDims; ++i) {
+      dims_vec[i] = dims.d[i];
    }
+    output = output.reshape(-1).view(dtype).slice(0, 0, prod).reshape(dims_vec);
+    outputs.push_back(output);
  }

  if (compiled_engine->profile_execution) {