Commit 74b921e

Fixed comments and some CI tests
1 parent f77df5a commit 74b921e

7 files changed: +52 -60 lines changed

docsrc/contributors/resource_management.rst

Lines changed: 9 additions & 9 deletions

@@ -18,7 +18,7 @@ Memory Usage Control
 CPU Memory
 ^^^^^^^^^^
 
-By default, Torch-TensorRT may consume up to **5×** the model size in CPU memory.
+By default, Torch-TensorRT may consume up to **5x** the model size in CPU memory.
 This can exceed system limits when compiling large models.
 
 **Common symptoms of high CPU memory usage:**
@@ -34,10 +34,10 @@ This can exceed system limits when compiling large models.
 
 .. code-block:: bash
 
-   export TRIM_CPU_MEMORY=1
+   export TORCHTRT_ENABLE_BUILDER_MALLOC_TRIM=1
 
-This reduces approximately **2×** of redundant model copies, limiting
-total CPU memory usage to up to **3×** the model size.
+This reduces approximately **2x** of redundant model copies, limiting
+total CPU memory usage to up to **3x** the model size.
 
 2. **Disable CPU offloading**
 
@@ -47,13 +47,13 @@ This can exceed system limits when compiling large models.
 
    offload_module_to_cpu = False
 
-This removes another **1×** model copy, reducing peak CPU memory
-usage to about **2×** the model size.
+This removes another **1x** model copy, reducing peak CPU memory
+usage to about **2x** the model size.
 
 GPU Memory
 ^^^^^^^^^^
 
-By default, Torch-TensorRT may consume up to **2×** the model size in GPU memory.
+By default, Torch-TensorRT may consume up to **2x** the model size in GPU memory.
 
 **Common symptoms of high GPU memory usage:**
 
@@ -71,7 +71,7 @@ By default, Torch-TensorRT may consume up to **2×** the model size in GPU memor
    offload_module_to_cpu = True
 
 This shifts one model copy from GPU to CPU memory.
-As a result, peak GPU memory usage decreases to about **1×**
-the model size, while CPU memory usage increases by roughly **1×**.
+As a result, peak GPU memory usage decreases to about **1x**
+the model size, while one more copy of the model occupies CPU memory, so CPU memory usage increases by roughly **1x**.
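
Reviewer note: for readers following this doc change, here is a minimal sketch of applying both knobs together from user code. The toy `torch.nn.Linear` model and the exact `torch_tensorrt.compile` call shape are assumptions for illustration; `TORCHTRT_ENABLE_BUILDER_MALLOC_TRIM` and `offload_module_to_cpu` are the names this commit introduces or touches:

```python
import os

# Hypothetical usage: the env var must be set before compilation starts
# so the builder sees it (name introduced by this commit).
os.environ["TORCHTRT_ENABLE_BUILDER_MALLOC_TRIM"] = "1"

import torch
import torch_tensorrt

model = torch.nn.Linear(64, 64).eval().cuda()  # toy stand-in model
inputs = [torch.randn(8, 64, device="cuda")]

trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    offload_module_to_cpu=False,  # skip the extra CPU-side model copy (~1x model size)
)
```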

docsrc/index.rst

Lines changed: 1 addition & 0 deletions

@@ -234,6 +234,7 @@ Contributor Documentation
    contributors/writing_dynamo_aten_lowering_passes
    contributors/ts_converters
    contributors/useful_links
+   contributors/resource_management
 
 Indices
 ----------------

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 1 addition & 41 deletions

@@ -1,5 +1,4 @@
 import gc
-import io
 import logging
 import os
 import warnings
@@ -595,32 +594,6 @@ def _save_weight_mapping(self) -> None:
         gc.collect()
         torch.cuda.empty_cache()
 
-    @needs_refit  # type: ignore[misc]
-    def _insert_engine_to_cache(self, hash_val: str, engine: trt.ICudaEngine) -> None:
-        serialized_engine = engine.serialize()
-        # TODO: @Evan is waiting for TRT's feature to cache the weight-stripped engine
-        # if not self.compilation_settings.strip_engine_weights:
-        #     # set EXCLUDE_WEIGHTS flag to strip weights
-        #     serialization_config = engine.create_serialization_config()
-        #     serialization_config.set_flag(trt.SerializationFlag.EXCLUDE_WEIGHTS)
-        #     serialized_engine = engine.serialize_with_config(
-        #         serialization_config
-        #     )
-
-        # Cache weighted engine for now
-        self.engine_cache.insert(  # type: ignore[union-attr]
-            hash_val,
-            (
-                serialized_engine,
-                self._input_names,
-                self._output_names,
-                self.input_specs,
-                self.compilation_settings,
-                self.weight_name_map,
-                self.ctx.requires_output_allocator,
-            ),
-        )
-
     @needs_refit  # type: ignore[misc]
     def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
         # query the cached TRT engine
@@ -673,7 +646,6 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
             settings=self.compilation_settings,
             weight_name_map=self.weight_name_map,
         )
-        serialized_engine = engine.serialize()
 
         # TODO: @Evan is waiting for TRT's feature to load the weight-stripped engine
         # # EXCLUDE_WEIGHTS flag must be cleared
@@ -686,12 +658,8 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
         #     )
         # # As of now, the engine becomes non-refittable because when EXCLUDE_WEIGHTS flag is cleared, the REFIT flag is also cleared by TRT to make the plan file smaller
 
-        with io.BytesIO() as engine_bytes:
-            engine_bytes.write(serialized_engine)
-            engine_str = engine_bytes.getvalue()
-
         return TRTInterpreterResult(
-            engine_str,
+            engine,
             self._input_names,
             self._output_names,
             self.weight_name_map,
@@ -774,14 +742,6 @@ def run(
             builder_config, self.compilation_settings.timing_cache_path
         )
 
-        # Engine caching only for refittable engines
-        if (
-            not self.compilation_settings.immutable_weights
-            and self.compilation_settings.cache_built_engines
-            and self.engine_cache is not None
-        ):
-            self._insert_engine_to_cache(hash_val, cuda_engine)
-
         return TRTInterpreterResult(
             cuda_engine,
             self._input_names,
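
Reviewer note: with `_insert_engine_to_cache` and the `io.BytesIO` round-trip removed, `TRTInterpreterResult` now carries the live `trt.ICudaEngine`, and callers serialize it once, on demand. A minimal sketch of the new calling pattern, continuing from the diff's `interpreter` object and assuming TensorRT's Python API, where `serialize()` returns a `trt.IHostMemory`:

```python
# Sketch: serialize the live engine returned by the interpreter on demand.
interpreter_result = interpreter.run()        # TRTInterpreterResult
plan = interpreter_result.engine.serialize()  # trt.IHostMemory
with open("engine.plan", "wb") as f:
    f.write(plan)  # IHostMemory implements the buffer protocol, so write() accepts it
```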

py/torch_tensorrt/dynamo/conversion/_conversion.py

Lines changed: 24 additions & 2 deletions

@@ -15,7 +15,7 @@
 from torch_tensorrt.dynamo.utils import (
     get_cpu_memory_usage,
     get_output_dtypes,
-    release_memory,
+    release_host_and_device_memory,
 )
 
 logger = logging.getLogger(__name__)
@@ -62,6 +62,7 @@ def interpret_module_to_result(
     Returns:
         TRTInterpreterResult
     """
+
     output_dtypes = infer_module_output_dtypes(
         module, truncate_double=settings.truncate_double
     )
@@ -80,7 +81,7 @@ def interpret_module_to_result(
     for attr in dir(module):
         if attr.startswith("_frozen_param"):
             delattr(module, attr)
-    release_memory()
+    release_host_and_device_memory()
     logger.debug(
         f"CPU memory usage after clearing frozen parameters and building memory in conversion: {get_cpu_memory_usage()} MB"
     )
@@ -92,6 +93,27 @@ def interpret_module_to_result(
     logger.debug(
         f"CPU memory usage after serializing engine: {get_cpu_memory_usage()} MB"
     )
+
+    # Engine caching only for refittable engines
+    if (
+        not settings.immutable_weights
+        and settings.cache_built_engines
+        and engine_cache is not None
+    ):
+        hash_val = engine_cache.get_hash(module, inputs, settings)
+        engine_cache.insert(
+            hash_val,
+            (
+                serialized_engine,
+                interpreter_result.input_names,
+                interpreter_result.output_names,
+                inputs,
+                settings,
+                interpreter_result.weight_name_map,
+                interpreter_result.requires_output_allocator,
+            ),
+        )
+
     serialized_interpreter_result = SerializedInterpreterResult(
         serialized_engine=serialized_engine,
         input_names=interpreter_result.input_names,
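
Since the caching block now lives in `interpret_module_to_result`, a hedged end-to-end example of the settings that feed it may help. `immutable_weights` and `cache_built_engines` appear in this diff; `reuse_cached_engines` and the toy model are assumptions about the surrounding API:

```python
import torch
import torch_tensorrt

model = torch.nn.Linear(64, 64).eval().cuda()  # toy stand-in model
inputs = [torch.randn(8, 64, device="cuda")]

# The relocated engine_cache.insert() above only runs when weights are
# refittable and engine caching is enabled, mirroring the condition in the diff.
trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=inputs,
    immutable_weights=False,
    cache_built_engines=True,
    reuse_cached_engines=True,
)
```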

py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py

Lines changed: 10 additions & 5 deletions

@@ -36,11 +36,16 @@ def constant_fold(
     # The constants are created on CPU to save GPU memory for TensorRT compilation.
     # For TRT INetwork construction the constants are moved to CPU in get_attr call.
     for node, constant in cf.node_replacements.items():
-        replace_node_with_constant(
-            gm,
-            node,
-            torch.nn.Parameter(constant.cpu().contiguous(), requires_grad=False),
-        )
+        if settings.offload_module_to_cpu:
+            replace_node_with_constant(
+                gm,
+                node,
+                torch.nn.Parameter(constant.cpu().contiguous(), requires_grad=False),
+            )
+        else:
+            replace_node_with_constant(
+                gm, node, torch.nn.Parameter(constant, requires_grad=False)
+            )
 
     erased_params = []
     for node in gm.graph.nodes:
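
A self-contained illustration of the new branch (a sketch, not library code): the only difference between the two paths is whether the folded constant is forced into contiguous CPU memory before being wrapped as a `Parameter`.

```python
import torch

def wrap_folded_constant(
    constant: torch.Tensor, offload_module_to_cpu: bool
) -> torch.nn.Parameter:
    # Mirrors the branch added in constant_fold: offloading pins the folded
    # constant to CPU; otherwise it stays on whatever device produced it.
    if offload_module_to_cpu:
        return torch.nn.Parameter(constant.cpu().contiguous(), requires_grad=False)
    return torch.nn.Parameter(constant, requires_grad=False)

p = wrap_folded_constant(torch.randn(4, 4), offload_module_to_cpu=True)
assert p.device.type == "cpu"
```

Keeping constants on their original device when offloading is disabled avoids a device-to-host copy per folded constant, at the cost of holding them in GPU memory during network construction.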

py/torch_tensorrt/dynamo/utils.py

Lines changed: 5 additions & 2 deletions

@@ -875,15 +875,18 @@ def get_cpu_memory_usage() -> Any:
     return psutil.Process().memory_info().rss / 1024 / 1024
 
 
-def release_memory() -> None:
+def release_host_and_device_memory() -> None:
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.synchronize()
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
         torch.cuda.synchronize()
 
-    if platform.system() == "Linux" and os.environ.get("TRIM_CPU_MEMORY", "0") == "1":
+    if (
+        platform.system() == "Linux"
+        and os.environ.get("TORCHTRT_ENABLE_BUILDER_MALLOC_TRIM", "0") == "1"
+    ):
         try:
             libc = ctypes.CDLL("libc.so.6")
             if libc.malloc_trim(0) != 1:
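
For context on the guard being renamed here, a standalone sketch of the glibc call it wraps; `malloc_trim` is Linux/glibc-only and returns 1 when free heap pages were actually returned to the OS:

```python
import ctypes
import platform

if platform.system() == "Linux":
    libc = ctypes.CDLL("libc.so.6")
    # malloc_trim(0): ask the allocator to release free heap memory back
    # to the OS; returns 1 if memory was released, 0 otherwise.
    released = libc.malloc_trim(0)
    print(f"malloc_trim released memory: {bool(released)}")
```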

tests/py/dynamo/conversion/harness.py

Lines changed: 2 additions & 1 deletion

@@ -208,8 +208,9 @@ def run_test(
         interpreter_result = interpreter.run()
         sec = time.perf_counter() - start
         _LOGGER.info(f"Interpreter run time(s): {sec}")
+        serialized_engine = interpreter_result.engine.serialize()
         trt_mod = rt_cls(
-            serialized_engine=interpreter_result.serialized_engine,
+            serialized_engine=serialized_engine,
             input_binding_names=list(interpreter_result.input_names),
             output_binding_names=list(interpreter_result.output_names),
             name="test_engine",
