Fixed comments and some CI tests

cehongwang · cehongwang · commit 74b921e1eb9c · 2025-10-30T22:04:43.000Z
diff --git a/docsrc/contributors/resource_management.rst b/docsrc/contributors/resource_management.rst
@@ -18,7 +18,7 @@ Memory Usage Control
 CPU Memory
 ^^^^^^^^^^
 
-By default, Torch-TensorRT may consume up to **5×** the model size in CPU memory.  
+By default, Torch-TensorRT may consume up to **5x** the model size in CPU memory.  
 This can exceed system limits when compiling large models.
 
 **Common symptoms of high CPU memory usage:**
@@ -34,10 +34,10 @@ This can exceed system limits when compiling large models.
 
    .. code-block:: bash
 
-      export TRIM_CPU_MEMORY=1
+      export TORCHTRT_ENABLE_BUILDER_MALLOC_TRIM=1
 
-   This reduces approximately **2×** of redundant model copies, limiting 
-   total CPU memory usage to up to **3×** the model size.
+   This eliminates roughly **2x** the model size in redundant model copies,
+   limiting total CPU memory usage to at most **3x** the model size.
 
 2. **Disable CPU offloading**
 
@@ -47,13 +47,13 @@ This can exceed system limits when compiling large models.
 
       offload_module_to_cpu = False
 
-   This removes another **1×** model copy, reducing peak CPU memory 
-   usage to about **2×** the model size.
+   This removes another **1x** model copy, reducing peak CPU memory 
+   usage to about **2x** the model size.
 
 GPU Memory
 ^^^^^^^^^^
 
-By default, Torch-TensorRT may consume up to **2×** the model size in GPU memory.
+By default, Torch-TensorRT may consume up to **2x** the model size in GPU memory.
 
 **Common symptoms of high GPU memory usage:**
 
@@ -71,7 +71,7 @@ By default, Torch-TensorRT may consume up to **2×** the model size in GPU memor
       offload_module_to_cpu = True
 
    This shifts one model copy from GPU to CPU memory.  
-   As a result, peak GPU memory usage decreases to about **1×** 
-   the model size, while CPU memory usage increases by roughly **1×**.
+   As a result, peak GPU memory usage decreases to about **1x** 
+   the model size, while CPU memory usage increases by roughly **1x** because one more copy of the model is held in CPU memory.
 
 
diff --git a/docsrc/index.rst b/docsrc/index.rst
@@ -234,6 +234,7 @@ Contributor Documentation
    contributors/writing_dynamo_aten_lowering_passes
    contributors/ts_converters
    contributors/useful_links
+   contributors/resource_management
 
 Indices
 ----------------
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -1,5 +1,4 @@
 import gc
-import io
 import logging
 import os
 import warnings
@@ -595,32 +594,6 @@ def _save_weight_mapping(self) -> None:
         gc.collect()
         torch.cuda.empty_cache()
 
-    @needs_refit  # type: ignore[misc]
-    def _insert_engine_to_cache(self, hash_val: str, engine: trt.ICudaEngine) -> None:
-        serialized_engine = engine.serialize()
-        # TODO: @Evan is waiting for TRT's feature to cache the weight-stripped engine
-        # if not self.compilation_settings.strip_engine_weights:
-        #     # set EXCLUDE_WEIGHTS flag to strip weights
-        #     serialization_config = engine.create_serialization_config()
-        #     serialization_config.set_flag(trt.SerializationFlag.EXCLUDE_WEIGHTS)
-        #     serialized_engine = engine.serialize_with_config(
-        #         serialization_config
-        #     )
-
-        # Cache weighted engine for now
-        self.engine_cache.insert(  # type: ignore[union-attr]
-            hash_val,
-            (
-                serialized_engine,
-                self._input_names,
-                self._output_names,
-                self.input_specs,
-                self.compilation_settings,
-                self.weight_name_map,
-                self.ctx.requires_output_allocator,
-            ),
-        )
-
     @needs_refit  # type: ignore[misc]
     def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
         # query the cached TRT engine
@@ -673,7 +646,6 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
                     settings=self.compilation_settings,
                     weight_name_map=self.weight_name_map,
                 )
-                serialized_engine = engine.serialize()
 
                 # TODO: @Evan is waiting for TRT's feature to load the weight-stripped engine
                 # # EXCLUDE_WEIGHTS flag must be cleared
@@ -686,12 +658,8 @@ def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
                 # )
                 # # As of now, the engine becomes non-refittable because when EXCLUDE_WEIGHTS flag is cleared, the REFIT flag is also cleared by TRT to make the plan file smaller
 
-            with io.BytesIO() as engine_bytes:
-                engine_bytes.write(serialized_engine)
-                engine_str = engine_bytes.getvalue()
-
             return TRTInterpreterResult(
-                engine_str,
+                engine,
                 self._input_names,
                 self._output_names,
                 self.weight_name_map,
@@ -774,14 +742,6 @@ def run(
             builder_config, self.compilation_settings.timing_cache_path
         )
 
-        # Engine caching only for refittable engines
-        if (
-            not self.compilation_settings.immutable_weights
-            and self.compilation_settings.cache_built_engines
-            and self.engine_cache is not None
-        ):
-            self._insert_engine_to_cache(hash_val, cuda_engine)
-
         return TRTInterpreterResult(
             cuda_engine,
             self._input_names,
diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -15,7 +15,7 @@
 from torch_tensorrt.dynamo.utils import (
     get_cpu_memory_usage,
     get_output_dtypes,
-    release_memory,
+    release_host_and_device_memory,
 )
 
 logger = logging.getLogger(__name__)
@@ -62,6 +62,7 @@ def interpret_module_to_result(
     Returns:
         TRTInterpreterResult
     """
+
     output_dtypes = infer_module_output_dtypes(
         module, truncate_double=settings.truncate_double
     )
@@ -80,7 +81,7 @@ def interpret_module_to_result(
     for attr in dir(module):
         if attr.startswith("_frozen_param"):
             delattr(module, attr)
-    release_memory()
+    release_host_and_device_memory()
     logger.debug(
         f"CPU memory usage after clearing frozen parameters and building memory in conversion: {get_cpu_memory_usage()} MB"
     )
@@ -92,6 +93,27 @@ def interpret_module_to_result(
         logger.debug(
             f"CPU memory usage after serializing engine: {get_cpu_memory_usage()} MB"
         )
+
+    # Engine caching only for refittable engines
+    if (
+        not settings.immutable_weights
+        and settings.cache_built_engines
+        and engine_cache is not None
+    ):
+        hash_val = engine_cache.get_hash(module, inputs, settings)
+        engine_cache.insert(
+            hash_val,
+            (
+                serialized_engine,
+                interpreter_result.input_names,
+                interpreter_result.output_names,
+                inputs,
+                settings,
+                interpreter_result.weight_name_map,
+                interpreter_result.requires_output_allocator,
+            ),
+        )
+
     serialized_interpreter_result = SerializedInterpreterResult(
         serialized_engine=serialized_engine,
         input_names=interpreter_result.input_names,
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py b/py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py
@@ -36,11 +36,16 @@ def constant_fold(
     # The constants are created on CPU to save GPU memory for TensorRT compilation.
     # For TRT INetwork construction the constants are moved to CPU in get_attr call.
     for node, constant in cf.node_replacements.items():
-        replace_node_with_constant(
-            gm,
-            node,
-            torch.nn.Parameter(constant.cpu().contiguous(), requires_grad=False),
-        )
+        if settings.offload_module_to_cpu:
+            replace_node_with_constant(
+                gm,
+                node,
+                torch.nn.Parameter(constant.cpu().contiguous(), requires_grad=False),
+            )
+        else:
+            replace_node_with_constant(
+                gm, node, torch.nn.Parameter(constant, requires_grad=False)
+            )
 
     erased_params = []
     for node in gm.graph.nodes:
diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py
@@ -875,15 +875,18 @@ def get_cpu_memory_usage() -> Any:
     return psutil.Process().memory_info().rss / 1024 / 1024
 
 
-def release_memory() -> None:
+def release_host_and_device_memory() -> None:
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.synchronize()
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
         torch.cuda.synchronize()
 
-    if platform.system() == "Linux" and os.environ.get("TRIM_CPU_MEMORY", "0") == "1":
+    if (
+        platform.system() == "Linux"
+        and os.environ.get("TORCHTRT_ENABLE_BUILDER_MALLOC_TRIM", "0") == "1"
+    ):
         try:
             libc = ctypes.CDLL("libc.so.6")
             if libc.malloc_trim(0) != 1:
diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py
@@ -208,8 +208,9 @@ def run_test(
             interpreter_result = interpreter.run()
             sec = time.perf_counter() - start
             _LOGGER.info(f"Interpreter run time(s): {sec}")
+            serialized_engine = interpreter_result.engine.serialize()
             trt_mod = rt_cls(
-                serialized_engine=interpreter_result.serialized_engine,
+                serialized_engine=serialized_engine,
                 input_binding_names=list(interpreter_result.input_names),
                 output_binding_names=list(interpreter_result.output_names),
                 name="test_engine",