xinhe-nv
diff --git a/cpp/include/tensorrt_llm/runtime/virtualMemory.h b/cpp/include/tensorrt_llm/runtime/virtualMemory.h
Lines changed: 3 additions & 3 deletions
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
Lines changed: 5 additions & 7 deletions
diff --git a/cpp/tensorrt_llm/runtime/virtualMemory.cpp b/cpp/tensorrt_llm/runtime/virtualMemory.cpp
Lines changed: 14 additions & 16 deletions
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
Lines changed: 4 additions & 0 deletions
diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py
Lines changed: 87 additions & 13 deletions
@@ -473,7 +473,7 @@ class CudaVirtualMemoryAllocator
         bool mBackground{};
 
         friend class CudaVirtualMemoryAllocator;
-        friend void setVirtualMemoryAllocator(
+        friend void pushVirtualMemoryAllocator(
             std::string const& tag, RestoreMode mode, std::shared_ptr<CudaStream> backStream);
 
     public:
@@ -566,8 +566,8 @@ namespace tensorrt_llm::runtime
 {
 CudaVirtualMemoryManager& getVirtualMemoryManager();
 CudaVirtualMemoryAllocator getVirtualMemoryAllocator();
-void setVirtualMemoryAllocator(
+/// Push an allocator configuration built from (tag, mode, backStream) and make it the current one.
+/// Undo with popVirtualMemoryAllocator().
+void pushVirtualMemoryAllocator(
     std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr<CudaStream> backStream);
-void clearVirtualMemoryAllocator();
+/// Pop the most recently pushed allocator configuration and make the one before it current again.
+void popVirtualMemoryAllocator();
 
 } // namespace tensorrt_llm::runtime
@@ -343,20 +343,18 @@ void initBindings(nb::module_& m)
         nb::rv_policy::reference);
 
     m.def(
-        "set_virtual_memory_allocator",
+        "push_virtual_memory_allocator",
         [](std::string const& tag, tr::CudaVirtualMemoryAllocator::RestoreMode mode, uintptr_t stream)
         {
             static_assert(sizeof(uintptr_t) == sizeof(cudaStream_t));
-            tr::setVirtualMemoryAllocator(tag, mode,
+            tr::pushVirtualMemoryAllocator(tag, mode,
                 std::make_shared<tr::CudaStream>(
                     reinterpret_cast<cudaStream_t>(stream), tensorrt_llm::common::getDevice(), false));
         },
-        "Set the virtual memory allocator and start allocating virtual memory for CUDA allocations",
-        nb::call_guard<nb::gil_scoped_release>());
+        "Push a virtual memory allocator onto the allocator stack.", nb::call_guard<nb::gil_scoped_release>());
 
-    m.def("clear_virtual_memory_allocator", &tr::clearVirtualMemoryAllocator,
-        "Reset the current virtual memory allocator and stop allocating virtual memory for CUDA allocations",
-        nb::call_guard<nb::gil_scoped_release>());
+    m.def("pop_virtual_memory_allocator", &tr::popVirtualMemoryAllocator,
+        "Pop the top virtual memory allocator from the allocator stack", nb::call_guard<nb::gil_scoped_release>());
 
     nb::class_<tensorrt_llm::runtime::McastGPUBuffer>(m, "McastGPUBuffer")
         .def(nb::init<size_t, uint32_t, uint32_t, uint32_t, bool, int64_t>(), nb::arg("buf_size"),
 
@@ -402,32 +402,30 @@ using AllocConf = CudaVirtualMemoryAllocator::Configuration;
 
 AllocConf AllocConf::backgroundConfiguration{getVirtualMemoryManager(), "", NONE, nullptr, true};
 
-static const std::shared_ptr<AllocConf> bgConf{std::shared_ptr<AllocConf>{}, &AllocConf::backgroundConfiguration};
-
-static std::shared_mutex currentConfMutex;
-static std::shared_ptr<AllocConf> currentConf = bgConf;
+static std::shared_mutex sConfMutex;
+static std::shared_ptr<AllocConf> sCurrentConf{std::shared_ptr<AllocConf>{}, &AllocConf::backgroundConfiguration};
+static std::vector<std::shared_ptr<AllocConf>> sConfStack;
 
+// Returns an allocator constructed from the configuration that is current at the time of the call.
 CudaVirtualMemoryAllocator getVirtualMemoryAllocator()
 {
-    std::shared_lock lock(currentConfMutex);
-    return CudaVirtualMemoryAllocator{currentConf};
+    // Shared (reader) lock: push/pop take the same mutex exclusively while they swap sCurrentConf.
+    std::shared_lock lock(sConfMutex);
+    return CudaVirtualMemoryAllocator{sCurrentConf};
 }
 
-void setVirtualMemoryAllocator(
+// Makes a new configuration built from (tag, mode, backStream) the current one and keeps the
+// previously current configuration on sConfStack so that popVirtualMemoryAllocator() can restore it.
+void pushVirtualMemoryAllocator(
     std::string const& tag, CudaVirtualMemoryAllocator::RestoreMode mode, std::shared_ptr<CudaStream> backStream)
 {
-    std::unique_lock lock(currentConfMutex);
-
-    TLLM_CHECK_WITH_INFO(currentConf == bgConf,
-        "An active virtual memory allocator (tag: %s, mode: %d, stream: %p) is already present",
-        currentConf->mTag.c_str(), currentConf->mMode, currentConf->mBackStream.get());
-    currentConf = std::make_shared<AllocConf>(getVirtualMemoryManager(), tag, mode, backStream);
+    std::unique_lock lock(sConfMutex);
+    // emplace_back appends the NEW configuration and returns a reference to that slot. Swapping the slot
+    // with sCurrentConf makes the new configuration current and leaves the PREVIOUS one stored in the slot.
+    sCurrentConf.swap(
+        sConfStack.emplace_back(std::make_shared<AllocConf>(getVirtualMemoryManager(), tag, mode, backStream)));
 }
 
-void clearVirtualMemoryAllocator()
+// Restores the configuration that was current before the most recent push.
+// The TLLM check fails if no push is outstanding.
+void popVirtualMemoryAllocator()
 {
-    std::unique_lock lock(currentConfMutex);
-    currentConf = bgConf;
+    std::unique_lock lock(sConfMutex);
+    TLLM_CHECK_WITH_INFO(!sConfStack.empty(), "popVirtualMemoryAllocator called with empty stack");
+    // The top slot holds the previously current configuration (see the swap in pushVirtualMemoryAllocator).
+    // Swap it back in, then drop the slot, which at that point holds the configuration being popped.
+    sCurrentConf.swap(sConfStack.back());
+    sConfStack.pop_back();
 }
 
 } // namespace tensorrt_llm::runtime
@@ -146,6 +146,8 @@ def __init__(
                                                  torch.nn.Module]] = None,
         model: Optional[torch.nn.Module] = None,
         checkpoint_loader: Optional[BaseCheckpointLoader] = None,
+        model_weights_memory_tag: Optional[str] = None,
+        model_weights_restore_mode=None,
     ):
         self.forward_pass_callable = None
         self.ub_buffers = None
@@ -212,6 +214,8 @@ def __init__(
                 max_num_tokens=self.max_num_tokens,
                 max_seq_len=self.max_seq_len,
                 lora_config=lora_config,
+                model_weights_memory_tag=model_weights_memory_tag,
+                model_weights_restore_mode=model_weights_restore_mode,
             )
             self.model, moe_load_balancer = self.model_loader.load(
                 checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader)
 
@@ -2,14 +2,15 @@
 import inspect
 import os
 import traceback
+import warnings
 from typing import Callable, Optional, Tuple
 
 import torch
 
 from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import (
     AutoCheckpointMapper, BaseCheckpointLoader)
 from tensorrt_llm._utils import str_dtype_to_torch
-from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
+from tensorrt_llm.llmapi.llm_args import ExecutorMemoryType, TorchLlmArgs
 from tensorrt_llm.llmapi.llm_utils import apply_model_defaults_to_llm_args
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_helper import LoraConfig
@@ -25,6 +26,8 @@
                                      timing)
 from ..modules.fused_moe.moe_load_balancer import (
     MoeLoadBalancer, maybe_create_moe_load_balancer)
+from ..virtual_memory import RestoreMode
+from ..virtual_memory import scope as virtual_memory_scope
 
 _KV_CACHE_MAP = {
     "fp8": QuantAlgo.FP8.value,
@@ -182,6 +185,15 @@ def _construct_checkpoint_loader(
     return checkpoint_loader
 
 
+def _apply_to_buffers_only(model: torch.nn.Module, fn):
+    """Replace each registered buffer of *model* and its submodules with ``fn(buffer)``.
+
+    Parameters are left untouched, and buffer slots that hold ``None`` are skipped.
+    """
+    for submodule in model.modules():
+        registered = submodule._buffers
+        for name, tensor in registered.items():
+            if tensor is None:
+                continue
+            # Reassigning an existing key does not resize the dict, so this is safe mid-iteration.
+            registered[name] = fn(tensor)
+
+
 class ModelLoader:
     """
     Handles the loading, configuration, and weight initialization of a PyTorch model.
@@ -195,7 +207,9 @@ def __init__(self,
                  sparse_attention_config: Optional["SparseAttentionConfig"],
                  max_num_tokens: int,
                  max_seq_len: Optional[int],
-                 lora_config: Optional[LoraConfig] = None):
+                 lora_config: Optional[LoraConfig] = None,
+                 model_weights_memory_tag: Optional[ExecutorMemoryType] = None,
+                 model_weights_restore_mode: Optional[RestoreMode] = None):
         """
         Initializes the ModelLoader.
 
@@ -206,6 +220,11 @@ def __init__(self,
             max_num_tokens: The maximum number of tokens the engine will handle.
             max_seq_len: The maximum sequence length.
             lora_config: Configuration for LoRA.
+            model_weights_memory_tag: When set, parameter allocations during
+                ``load()`` are placed under a separate virtual-memory tag so
+                they can be released/materialized independently of buffers.
+            model_weights_restore_mode: RestoreMode for the model weights
+                virtual-memory scope.
         """
         self.llm_args = llm_args
         self.mapping = mapping
@@ -214,6 +233,9 @@ def __init__(self,
         self.max_num_tokens = max_num_tokens
         self.max_seq_len = max_seq_len
         self.lora_config = lora_config
+        self.model_weights_memory_tag = model_weights_memory_tag
+        self.model_weights_restore_mode = model_weights_restore_mode
+        self._weight_pool_proxy = None
 
     @staticmethod
     def load_config_and_apply_defaults(
@@ -275,29 +297,81 @@ def load(
                 config_copy = copy.deepcopy(config)
                 with MetaInitMode():
                     model = AutoModelForCausalLM.from_config(config_copy)
+                config = config_copy
+                is_meta_init = True
+            except Exception:
+                logger.info(
+                    f"Fallback to regular model init: {traceback.format_exc(limit=10)}"
+                )
+                model = AutoModelForCausalLM.from_config(config)
+                is_meta_init = False
+
+            memo = dict()
+
+            if self.model_weights_memory_tag is not None:
+                # Allocate buffers to the outer virtual_memory_scope,
+                # but parameters (weights) to the dedicated inner virtual_memory_scope.
+
+                def allocate_buffer_on_cuda(t: torch.Tensor):
+                    """Return the CUDA counterpart of buffer *t*, creating it on first sight."""
+                    if t not in memo:
+                        if t.device == torch.device('meta'):
+                            # Meta tensors carry no data: allocate uninitialized CUDA storage.
+                            cuda_t = torch.empty_like(t, device='cuda')
+                        else:
+                            cuda_t = t.cuda()
+                        memo[t] = cuda_t
+                        # Also map the result to itself, so a later lookup of the CUDA tensor
+                        # returns it unchanged instead of allocating again.
+                        memo[cuda_t] = cuda_t
+                    return memo[t]
 
-                memo = dict()
+                _apply_to_buffers_only(model, allocate_buffer_on_cuda)
+
+                need_initialized_weights = load_format not in (LoadFormat.AUTO,
+                                                               LoadFormat.DUMMY)
+
+                def allocate_weights_on_cuda(t: torch.Tensor):
+                    """Return the CUDA tensor that replaces *t*, allocating it on first sight.
+
+                    The result is cached in ``memo`` under both *t* and the new tensor, so a
+                    tensor seen more than once (or the replacement itself) always maps to the
+                    same CUDA tensor.
+                    """
+                    if t in memo:
+                        return memo[t]
+
+                    cuda_t = torch.empty_like(t, device='cuda')
+                    # Meta tensors carry no data. Real tensors are copied only on the meta-init
+                    # path or when the load format is neither AUTO nor DUMMY.
+                    if t.device != torch.device('meta') and (
+                            need_initialized_weights or is_meta_init):
+                        if t.is_cuda:
+                            memory_type_map = {
+                                ExecutorMemoryType.MODEL_WEIGHTS_MAIN:
+                                ExecutorMemoryType.MODEL_ENGINE_MAIN,
+                                ExecutorMemoryType.MODEL_WEIGHTS_DRAFT:
+                                ExecutorMemoryType.MODEL_ENGINE_DRAFT,
+                            }
+                            # Use a generic name instead of raising KeyError from a warning
+                            # path when the tag is not one of the weight tags above.
+                            outer_scope = memory_type_map.get(
+                                self.model_weights_memory_tag, "enclosing")
+
+                            warnings.warn(
+                                f"A weight tensor of shape {t.shape} is already allocated on CUDA device before "
+                                f"the weight allocation stage. This will cause extra CUDA memory usage in the "
+                                f"'{outer_scope}' scope."
+                            )
+                        cuda_t.copy_(t)
+                    memo[t] = cuda_t
+                    memo[cuda_t] = cuda_t
+                    return cuda_t
+
+                with virtual_memory_scope(
+                        self.model_weights_memory_tag,
+                        self.model_weights_restore_mode) as pool:
+                    model._apply(allocate_weights_on_cuda)
+                self._weight_pool_proxy = pool
+            elif is_meta_init:
 
                 def init_meta_tensor(t: torch.Tensor):
+                    # Tensors that already have real storage are returned unchanged.
                     if t.device != torch.device('meta'):
                         return t
+
+                    # Allocate uninitialized CUDA storage once per meta tensor and reuse it on
+                    # repeated lookups of the same tensor.
                     if t not in memo:
                         memo[t] = torch.empty_like(t, device='cuda')
                     return memo[t]
 
                 model._apply(init_meta_tensor)
-                config = config_copy
-
-            except Exception:
-                logger.info(
-                    f"Fallback to regular model init: {traceback.format_exc(limit=10)}\n"
-                )
-                model = AutoModelForCausalLM.from_config(config)
-            finally:
-                if 'memo' in locals():
-                    del memo
 
+            # Ensure every parameter and buffer ends up on CUDA.
+            # This is a no-op if the allocation steps above worked as expected.
             model.to("cuda")
+            del memo
+
             rank_model_storage = get_rank_model_storage(model)
             logger.info(
                 f"Use {rank_model_storage / (1024**3):.2f} GB for model weights."