Skip to content

Commit e3909ac

Browse files
committed
tests: Adding additional test cases for the unowned tensor feature
1 parent 99660e6 commit e3909ac

File tree

1 file changed

+229
-3
lines changed

1 file changed

+229
-3
lines changed

tests/py/dynamo/runtime/test_pre_allocated_outputs.py

Lines changed: 229 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def forward(self, x):
125125
)
126126
torch._dynamo.reset()
127127

128-
def test_pre_allocated_outputs_unowned_outputs(self):
128+
def test_pre_allocated_outputs_unowned_outputs_py_api_check_no_realloc(self):
129129
class SampleModel(torch.nn.Module):
130130
def forward(self, x):
131131
return torch.softmax(x * 7 + 2, dim=0)
@@ -146,21 +146,247 @@ def forward(self, x):
146146
)
147147

148148
with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
149-
optimized_model(inputs[0])
149+
_ = optimized_model(inputs[0])
150150
output_tensors = [
151151
trt_mod.pre_allocated_outputs
152152
for name, trt_mod in optimized_model.named_children()
153153
if "_run_on_acc" in name
154154
]
155-
optimized_model(inputs[0])
155+
_ = optimized_model(inputs[0])
156156
new_output_tensors = [
157157
trt_mod.pre_allocated_outputs
158158
for name, trt_mod in optimized_model.named_children()
159159
if "_run_on_acc" in name
160160
]
161+
162+
# Run to run, output of intermediate engine is not reallocated
161163
self.assertTrue(output_tensors[0] is new_output_tensors[0])
164+
# Run to run, output of output engine is reallocated
162165
self.assertTrue(output_tensors[1] is not new_output_tensors[1])
163166

167+
@parameterized.expand(
168+
[
169+
("python_runtime", True),
170+
("cpp_runtime", False),
171+
]
172+
)
173+
def test_pre_allocated_outputs_unowned_outputs_api_check(self, _, use_python_runtime):
    """Check the ``are_output_tensors_unowned()`` flag of each TRT submodule.

    ``aten.add`` is forced to run in PyTorch, and the test expects the
    compiled module to expose two accelerated submodules named
    ``_run_on_acc_0`` and ``_run_on_acc_2``. After one inference under
    ``enable_pre_allocated_outputs`` the first is expected to report
    ``False`` and the second ``True``.

    Args:
        _: Human-readable case name supplied by ``parameterized.expand``.
        use_python_runtime: Forwarded to ``torchtrt.compile``. It also selects
            where the flag is queried: on the submodule itself when True, on
            its ``engine`` attribute otherwise.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            return torch.softmax(x * 7 + 2, dim=0)

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    fx_graph = torch.fx.symbolic_trace(model)

    # Compile with the add kept in PyTorch so more than one TRT submodule exists.
    optimized_model = torchtrt.compile(
        fx_graph,
        "dynamo",
        inputs[0],
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=use_python_runtime,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        # The flags are only inspected after an inference has been executed.
        _ = optimized_model(inputs[0])

        trt_modules = (
            optimized_model._run_on_acc_0,
            optimized_model._run_on_acc_2,
        )
        if use_python_runtime:
            unowned_flags = [m.are_output_tensors_unowned() for m in trt_modules]
        else:
            unowned_flags = [
                m.engine.are_output_tensors_unowned() for m in trt_modules
            ]

        # assertEqual reports the observed flags on failure, unlike
        # assertTrue(all(...)).
        self.assertEqual(unowned_flags, [False, True])

    # Match the sibling tests, which reset Dynamo state when they finish.
    torch._dynamo.reset()
206+
207+
@parameterized.expand(
208+
[
209+
("python_runtime", True),
210+
("cpp_runtime", False),
211+
]
212+
)
213+
def test_pre_allocated_outputs_unowned_outputs(self, _, use_python_runtime):
    """Compare compiled-model results against eager PyTorch and across runs.

    The model is compiled with ``aten.add`` forced to run in PyTorch, then
    executed twice on the same input under ``enable_pre_allocated_outputs``.
    The first result must be close to the eager result, and the two compiled
    results must be close to each other.

    Args:
        _: Human-readable case name supplied by ``parameterized.expand``.
        use_python_runtime: Forwarded to ``torchtrt.compile``.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            shifted = x * 7 + 2
            return torch.softmax(shifted, dim=0)

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    sample = inputs[0]
    fx_graph = torch.fx.symbolic_trace(model)

    compile_options = dict(
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=use_python_runtime,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )
    optimized_model = torchtrt.compile(fx_graph, "dynamo", sample, **compile_options)

    # Eager reference result from the uncompiled module.
    torch_res = model(sample)

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        res_1 = optimized_model(sample)
        res_2 = optimized_model(sample)

    # Both comparisons below use the same tolerances.
    closeness = dict(rtol=5e-03, atol=5e-03, equal_nan=True, check_dtype=True)

    # The compiled result matches eager PyTorch.
    torch.testing.assert_close(torch_res, res_1, **closeness)

    # Consecutive compiled runs agree with each other.
    torch.testing.assert_close(res_1, res_2, **closeness)

    torch._dynamo.reset()
260+
261+
262+
def test_pre_allocated_outputs_unowned_outputs_multiple_outputs_py_api_check_no_realloc(self):
    """Check that pre-allocated output buffers change between runs.

    The model returns three tensors (``y``, ``z``, ``a``) and is compiled for
    the Python runtime with ``aten.add`` forced to run in PyTorch. After each
    of two inferences, the data pointers of every ``_run_on_acc*`` child's
    ``pre_allocated_outputs`` are recorded.

    NOTE: despite ``no_realloc`` in the test name, the assertions require the
    recorded pointers to *differ* between the two runs for both the first and
    the second accelerated submodule. The name is kept so the test ID stays
    stable.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            y = torch.ops.aten.mul(x, 7)
            z = torch.ops.aten.add(y, 2)
            a = torch.ops.aten.softmax(z, dim=0)
            return y, z, a

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    fx_graph = torch.fx.symbolic_trace(model)

    # Compile with the add kept in PyTorch so more than one TRT submodule exists.
    optimized_model = torchtrt.compile(
        fx_graph,
        "dynamo",
        inputs[0],
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=True,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )

    def _pre_allocated_ptrs():
        # One list of buffer addresses per accelerated submodule, in child order.
        return [
            [t.data_ptr() for t in trt_mod.pre_allocated_outputs]
            for name, trt_mod in optimized_model.named_children()
            if "_run_on_acc" in name
        ]

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        _ = optimized_model(inputs[0])
        output_tensors = _pre_allocated_ptrs()

        _ = optimized_model(inputs[0])
        new_output_tensors = _pre_allocated_ptrs()

        # Run to run, output of intermediate engine is reallocated
        self.assertNotEqual(output_tensors[0], new_output_tensors[0])
        # Run to run, output of output engine is reallocated
        self.assertNotEqual(output_tensors[1], new_output_tensors[1])

    # Match the sibling tests, which reset Dynamo state when they finish.
    torch._dynamo.reset()
304+
305+
@parameterized.expand(
306+
[
307+
("python_runtime", True),
308+
("cpp_runtime", False),
309+
]
310+
)
311+
def test_pre_allocated_outputs_unowned_outputs_multiple_outputs_api_check(self, _, use_python_runtime):
    """Check ``are_output_tensors_unowned()`` when every intermediate is returned.

    The model returns three tensors (``y``, ``z``, ``a``) and is compiled with
    ``aten.add`` forced to run in PyTorch. After one inference under
    ``enable_pre_allocated_outputs``, both ``_run_on_acc_0`` and
    ``_run_on_acc_2`` are expected to report ``True``.

    Args:
        _: Human-readable case name supplied by ``parameterized.expand``.
        use_python_runtime: Forwarded to ``torchtrt.compile``. It also selects
            where the flag is queried: on the submodule itself when True, on
            its ``engine`` attribute otherwise.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            y = torch.ops.aten.mul(x, 7)
            z = torch.ops.aten.add(y, 2)
            a = torch.ops.aten.softmax(z, dim=0)
            return y, z, a

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    fx_graph = torch.fx.symbolic_trace(model)

    # Compile with the add kept in PyTorch so more than one TRT submodule exists.
    optimized_model = torchtrt.compile(
        fx_graph,
        "dynamo",
        inputs[0],
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=use_python_runtime,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        # The flags are only inspected after an inference has been executed.
        _ = optimized_model(inputs[0])

        trt_modules = (
            optimized_model._run_on_acc_0,
            optimized_model._run_on_acc_2,
        )
        if use_python_runtime:
            unowned_flags = [m.are_output_tensors_unowned() for m in trt_modules]
        else:
            unowned_flags = [
                m.engine.are_output_tensors_unowned() for m in trt_modules
            ]

        # assertEqual reports the observed flags on failure, unlike
        # assertTrue(all(...)).
        self.assertEqual(unowned_flags, [True, True])

    # Match the sibling tests, which reset Dynamo state when they finish.
    torch._dynamo.reset()
347+
348+
@parameterized.expand(
349+
[
350+
("python_runtime", True),
351+
("cpp_runtime", False),
352+
]
353+
)
354+
def test_pre_allocated_outputs_unowned_outputs_multi_outputs(self, _, use_python_runtime):
    """Check that two consecutive runs of a three-output model agree.

    The model returns ``y``, ``z`` and ``a`` and is compiled with ``aten.add``
    forced to run in PyTorch. It is executed twice on the same input under
    ``enable_pre_allocated_outputs`` and the two result tuples are compared
    within tolerance.

    Args:
        _: Human-readable case name supplied by ``parameterized.expand``.
        use_python_runtime: Forwarded to ``torchtrt.compile``.
    """

    class SampleModel(torch.nn.Module):
        def forward(self, x):
            y = torch.ops.aten.mul(x, 7)
            z = torch.ops.aten.add(y, 2)
            a = torch.ops.aten.softmax(z, dim=0)
            return y, z, a

    model = SampleModel().eval().cuda()
    inputs = [torch.randn(*INPUT_SIZE).cuda() for _ in range(TRIALS)]
    sample = inputs[0]
    fx_graph = torch.fx.symbolic_trace(model)

    compile_options = dict(
        min_block_size=1,
        pass_through_build_failures=True,
        use_python_runtime=use_python_runtime,
        torch_executed_ops={torch.ops.aten.add.Tensor},
    )
    optimized_model = torchtrt.compile(fx_graph, "dynamo", sample, **compile_options)

    with torchtrt.runtime.enable_pre_allocated_outputs(optimized_model):
        res_1 = optimized_model(sample)
        res_2 = optimized_model(sample)

    # Results from the first and second run must agree within tolerance.
    closeness = dict(rtol=5e-03, atol=5e-03, equal_nan=True, check_dtype=True)
    torch.testing.assert_close(res_1, res_2, **closeness)

    torch._dynamo.reset()
165391

166392

0 commit comments

Comments
 (0)