@@ -125,7 +125,7 @@ def forward(self, x):
125125 )
126126 torch ._dynamo .reset ()
127127
128- def test_pre_allocated_outputs_unowned_outputs (self ):
128+ def test_pre_allocated_outputs_unowned_outputs_py_api_check_no_realloc (self ):
129129 class SampleModel (torch .nn .Module ):
130130 def forward (self , x ):
131131 return torch .softmax (x * 7 + 2 , dim = 0 )
@@ -146,21 +146,247 @@ def forward(self, x):
146146 )
147147
148148 with torchtrt .runtime .enable_pre_allocated_outputs (optimized_model ):
149- optimized_model (inputs [0 ])
149+ _ = optimized_model (inputs [0 ])
150150 output_tensors = [
151151 trt_mod .pre_allocated_outputs
152152 for name , trt_mod in optimized_model .named_children ()
153153 if "_run_on_acc" in name
154154 ]
155- optimized_model (inputs [0 ])
155+ _ = optimized_model (inputs [0 ])
156156 new_output_tensors = [
157157 trt_mod .pre_allocated_outputs
158158 for name , trt_mod in optimized_model .named_children ()
159159 if "_run_on_acc" in name
160160 ]
161+
162+ # Run to run, output of intermediate engine is not reallocated
161163 self .assertTrue (output_tensors [0 ] is new_output_tensors [0 ])
164+ # Run to run, output of output engine is reallocated
162165 self .assertTrue (output_tensors [1 ] is not new_output_tensors [1 ])
163166
@parameterized.expand(
    [
        ("python_runtime", True),
        ("cpp_runtime", False),
    ]
)
def test_pre_allocated_outputs_unowned_outputs_api_check(
    self, _, use_python_runtime
):
    """Check the ``are_output_tensors_unowned`` flags for a single-output model.

    Compiles ``softmax(x * 7 + 2)`` with ``aten.add.Tensor`` listed in
    ``torch_executed_ops``, runs it once with pre-allocated outputs enabled,
    and asserts that ``_run_on_acc_0`` reports ``False`` while
    ``_run_on_acc_2`` reports ``True``.

    Args:
        _: Human-readable case name supplied by ``parameterized`` (unused).
        use_python_runtime: Forwarded to ``torchtrt.compile``; also selects
            where the flag is queried (the submodule itself for the Python
            runtime, its ``.engine`` attribute otherwise).
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            return torch.softmax(x * 7 + 2, dim=0)

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    fx_graph = torch.fx.symbolic_trace(model)

    # The assertions below expect two accelerated submodules
    # (`_run_on_acc_0` and `_run_on_acc_2`); presumably the graph is split
    # because aten.add is excluded from conversion -- TODO confirm.
    optimized_model = torchtrt.compile(
        fx_graph,
        "dynamo",
        inputs[0],
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=use_python_runtime,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        _ = optimized_model(inputs[0])

        flag_holders = [
            optimized_model._run_on_acc_0,
            optimized_model._run_on_acc_2,
        ]
        if not use_python_runtime:
            # For the non-Python runtime the query lives on `.engine`.
            flag_holders = [holder.engine for holder in flag_holders]

        # assertEqual reports both lists on failure, so the offending
        # engine is visible without re-running the test.
        self.assertEqual(
            [holder.are_output_tensors_unowned() for holder in flag_holders],
            [False, True],
        )

    # Same per-test cleanup the result-comparison tests in this file perform.
    torch._dynamo.reset()
206+
@parameterized.expand(
    [
        ("python_runtime", True),
        ("cpp_runtime", False),
    ]
)
def test_pre_allocated_outputs_unowned_outputs(self, _, use_python_runtime):
    """Compare compiled results against eager Torch and across two runs.

    The model ``softmax(x * 7 + 2)`` is compiled with ``aten.add.Tensor``
    listed in ``torch_executed_ops`` and called twice on the same input
    while pre-allocated outputs are enabled. The first result must be close
    to the eager result, and the two compiled results must be close to each
    other.

    Args:
        _: Human-readable case name supplied by ``parameterized`` (unused).
        use_python_runtime: Forwarded to ``torchtrt.compile``.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            return torch.softmax(x * 7 + 2, dim=0)

    eager_module = SampleModel().eval().cuda()
    samples = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    traced = torch.fx.symbolic_trace(eager_module)

    compile_options = {
        "min_block_size": 1,
        "pass_through_build_failures": True,
        "use_python_runtime": use_python_runtime,
        "torch_executed_ops": {torch.ops.aten.add.Tensor},
    }
    trt_module = torchtrt.compile(traced, "dynamo", samples[0], **compile_options)

    expected = eager_module(samples[0])

    with torchtrt.runtime.enable_pre_allocated_outputs(trt_module):
        first_run = trt_module(samples[0])
        second_run = trt_module(samples[0])

    # Both comparisons share the same tolerances and dtype strictness.
    closeness = {
        "rtol": 5e-03,
        "atol": 5e-03,
        "equal_nan": True,
        "check_dtype": True,
    }

    # Compiled output matches the eager reference.
    torch.testing.assert_close(expected, first_run, **closeness)

    # The first run's result still agrees with the second run's result.
    torch.testing.assert_close(first_run, second_run, **closeness)

    torch._dynamo.reset()
260+
261+
def test_pre_allocated_outputs_unowned_outputs_multiple_outputs_py_api_check_no_realloc(
    self,
):
    """Check run-to-run buffer addresses when every stage feeds a model output.

    The model returns ``mul``, ``add`` and ``softmax`` results, and is
    compiled for the Python runtime with ``aten.add.Tensor`` listed in
    ``torch_executed_ops``. After each of two calls, the ``data_ptr()``
    values of every ``_run_on_acc*`` child's ``pre_allocated_outputs`` are
    recorded; the test asserts that the recorded addresses differ between
    the two calls for both of the first two such children.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            y = torch.ops.aten.mul(x, 7)
            z = torch.ops.aten.add(y, 2)
            a = torch.ops.aten.softmax(z, dim=0)
            return y, z, a

    def pre_allocated_ptrs(module):
        # One list of buffer addresses per accelerated child, in
        # `named_children()` order.
        return [
            [t.data_ptr() for t in trt_mod.pre_allocated_outputs]
            for name, trt_mod in module.named_children()
            if "_run_on_acc" in name
        ]

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    fx_graph = torch.fx.symbolic_trace(model)

    optimized_model = torchtrt.compile(
        fx_graph,
        "dynamo",
        inputs[0],
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=True,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        # Results are irrelevant here; only the buffer addresses are checked.
        _ = optimized_model(inputs[0])
        output_tensors = pre_allocated_ptrs(optimized_model)

        _ = optimized_model(inputs[0])
        new_output_tensors = pre_allocated_ptrs(optimized_model)

        # Run to run, output of intermediate engine is reallocated
        self.assertNotEqual(output_tensors[0], new_output_tensors[0])
        # Run to run, output of output engine is reallocated
        self.assertNotEqual(output_tensors[1], new_output_tensors[1])

    # Same per-test cleanup the result-comparison tests in this file perform.
    torch._dynamo.reset()
304+
@parameterized.expand(
    [
        ("python_runtime", True),
        ("cpp_runtime", False),
    ]
)
def test_pre_allocated_outputs_unowned_outputs_multiple_outputs_api_check(
    self, _, use_python_runtime
):
    """Check the ``are_output_tensors_unowned`` flags for a multi-output model.

    The model returns its ``mul``, ``add`` and ``softmax`` results and is
    compiled with ``aten.add.Tensor`` listed in ``torch_executed_ops``.
    After one call with pre-allocated outputs enabled, both
    ``_run_on_acc_0`` and ``_run_on_acc_2`` must report ``True``.

    Args:
        _: Human-readable case name supplied by ``parameterized`` (unused).
        use_python_runtime: Forwarded to ``torchtrt.compile``; also selects
            where the flag is queried (the submodule itself for the Python
            runtime, its ``.engine`` attribute otherwise).
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            y = torch.ops.aten.mul(x, 7)
            z = torch.ops.aten.add(y, 2)
            a = torch.ops.aten.softmax(z, dim=0)
            return y, z, a

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    fx_graph = torch.fx.symbolic_trace(model)

    # The assertions below expect two accelerated submodules
    # (`_run_on_acc_0` and `_run_on_acc_2`); presumably the graph is split
    # because aten.add is excluded from conversion -- TODO confirm.
    optimized_model = torchtrt.compile(
        fx_graph,
        "dynamo",
        inputs[0],
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=use_python_runtime,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        _ = optimized_model(inputs[0])

        flag_holders = [
            optimized_model._run_on_acc_0,
            optimized_model._run_on_acc_2,
        ]
        if not use_python_runtime:
            # For the non-Python runtime the query lives on `.engine`.
            flag_holders = [holder.engine for holder in flag_holders]

        # assertEqual reports both lists on failure, so the offending
        # engine is visible without re-running the test.
        self.assertEqual(
            [holder.are_output_tensors_unowned() for holder in flag_holders],
            [True, True],
        )

    # Same per-test cleanup the result-comparison tests in this file perform.
    torch._dynamo.reset()
347+
@parameterized.expand(
    [
        ("python_runtime", True),
        ("cpp_runtime", False),
    ]
)
def test_pre_allocated_outputs_unowned_outputs_multi_outputs(
    self, _, use_python_runtime
):
    """Check that two consecutive runs of a multi-output model agree.

    The model returns its ``mul``, ``add`` and ``softmax`` results and is
    compiled with ``aten.add.Tensor`` listed in ``torch_executed_ops``. It
    is called twice on the same input with pre-allocated outputs enabled,
    and the two result tuples are compared with ``assert_close``. No eager
    Torch reference is compared in this test.

    Args:
        _: Human-readable case name supplied by ``parameterized`` (unused).
        use_python_runtime: Forwarded to ``torchtrt.compile``.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            y = torch.ops.aten.mul(x, 7)
            z = torch.ops.aten.add(y, 2)
            a = torch.ops.aten.softmax(z, dim=0)
            return y, z, a

    eager_module = SampleModel().eval().cuda()
    samples = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    traced = torch.fx.symbolic_trace(eager_module)

    compile_options = {
        "min_block_size": 1,
        "pass_through_build_failures": True,
        "use_python_runtime": use_python_runtime,
        "torch_executed_ops": {torch.ops.aten.add.Tensor},
    }
    trt_module = torchtrt.compile(traced, "dynamo", samples[0], **compile_options)

    with torchtrt.runtime.enable_pre_allocated_outputs(trt_module):
        first_run = trt_module(samples[0])
        second_run = trt_module(samples[0])

    # The first run's outputs must still agree with the second run's outputs.
    closeness = {
        "rtol": 5e-03,
        "atol": 5e-03,
        "equal_nan": True,
        "check_dtype": True,
    }
    torch.testing.assert_close(first_run, second_run, **closeness)

    torch._dynamo.reset()
165391
166392
0 commit comments