Skip to content

Commit d04b750

Browse files
committed
feat(torch): add GMS weight-loading prototype
1 parent d41949b commit d04b750

8 files changed

Lines changed: 826 additions & 33 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from .gpu_memory_backend import GMSBackend, GPUMemoryBackend
5+
6+
__all__ = ["GPUMemoryBackend", "GMSBackend"]
Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from contextlib import contextmanager
5+
from typing import Iterator, Optional, Protocol, runtime_checkable
6+
7+
import torch
8+
from torch import nn
9+
10+
from tensorrt_llm.logger import logger
11+
from tensorrt_llm.mapping import Mapping
12+
13+
_MODE_ALIASES = ("rw", "ro", "auto")
14+
15+
16+
@runtime_checkable
17+
class GPUMemoryBackend(Protocol):
18+
def connect(self) -> bool:
19+
...
20+
21+
@property
22+
def is_rw(self) -> Optional[bool]:
23+
...
24+
25+
def has_committed_weights(self) -> bool:
26+
...
27+
28+
def mem_pool_scope(self, device: Optional[torch.device] = None) -> Iterator[None]:
29+
...
30+
31+
def materialize_module(self, model: nn.Module) -> None:
32+
...
33+
34+
def finalize_write(self, model: nn.Module) -> int:
35+
...
36+
37+
def move_untracked_params(self, model: nn.Module) -> None:
38+
...
39+
40+
def cleanup(self) -> None:
41+
...
42+
43+
44+
class GMSBackend:
    """Weight-loading backend backed by the external GPU Memory Service (GMS).

    A session is either RW (this process writes the weights into the service
    and then commits them, downgrading itself to RO) or RO (weights committed
    by a prior writer are materialized directly into the model). The granted
    mode is decided by the service at connect() time when ``mode="auto"``.

    All ``gpu_memory_service`` imports are deferred into the methods so that
    the package remains an optional dependency.
    """

    # Tag namespace under which weight allocations are registered with the service.
    DEFAULT_TAG = "weights"

    def __init__(
        self,
        socket_path: Optional[str],
        mapping: Mapping,
        mode: str = "auto",
        tag: str = DEFAULT_TAG,
    ) -> None:
        """Record connection parameters; no I/O happens until connect().

        Args:
            socket_path: UNIX socket of the GMS daemon, or None to derive it
                from the current CUDA device index and *tag*.
            mapping: TP/PP mapping of this rank (used for logging in
                materialize_module).
            mode: one of ``_MODE_ALIASES`` — "rw", "ro", or "auto".
            tag: allocation tag shared by writer and readers.

        Raises:
            ValueError: if *mode* is not a recognized alias.
        """
        if mode not in _MODE_ALIASES:
            raise ValueError(
                f"GMS mode must be one of {_MODE_ALIASES}, got {mode!r}")

        self._socket_path = socket_path
        self._mapping = mapping
        self._mode = mode
        self._tag = tag
        # Bound at construction time: the backend is pinned to whatever CUDA
        # device is current when the object is created.
        self._device_index = torch.cuda.current_device()
        # GMS client memory manager; populated by connect(), None when detached.
        self._client = None
        # None until connected; then True if the service granted an RW lock.
        self._is_rw: Optional[bool] = None

    def connect(self) -> bool:
        """Connect to the GMS daemon and record the granted lock type.

        Returns True on success. Returns False (never raises) when the
        ``gpu_memory_service`` package is missing or the connection attempt
        fails, so callers can fall back to another load path.
        """
        try:
            from gpu_memory_service.client.torch.allocator import (
                get_or_create_gms_client_memory_manager,
            )
            from gpu_memory_service.common.locks import GrantedLockType, RequestedLockType
            from gpu_memory_service.common.utils import get_socket_path
            from gpu_memory_service.integrations.common.patches import patch_empty_cache
        except ImportError:
            logger.warning(
                "gpu_memory_service is not installed; LoadFormat.GMS is unavailable.")
            return False

        # Translate our string mode into the service's lock-request enum;
        # "auto" lets the service pick RW for the first writer, RO afterwards.
        mode_map = {
            "rw": RequestedLockType.RW,
            "ro": RequestedLockType.RO,
            "auto": RequestedLockType.RW_OR_RO,
        }

        socket_path = self._socket_path
        if socket_path is None:
            # Derive the per-device, per-tag default socket and remember it
            # so later log messages report the resolved path.
            socket_path = get_socket_path(self._device_index, self._tag)
            self._socket_path = socket_path

        try:
            self._client = get_or_create_gms_client_memory_manager(
                socket_path,
                self._device_index,
                mode=mode_map[self._mode],
                tag=self._tag,
            )
        except Exception as e:
            logger.warning(
                "Failed to connect to GMS at %s (mode=%s, tag=%s): %s",
                socket_path,
                self._mode,
                self._tag,
                e,
            )
            self._client = None
            return False

        self._is_rw = self._client.granted_lock_type == GrantedLockType.RW
        # Best-effort: patch torch.cuda.empty_cache so it cooperates with the
        # GMS pool. Failure is tolerated — presumably only an optimization;
        # TODO(review): confirm against gpu_memory_service docs.
        try:
            patch_empty_cache()
        except Exception as e:
            logger.debug("GMS patch_empty_cache failed (non-fatal): %s", e)

        logger.info(
            "Connected to GMS at %s (mode=%s, granted=%s, tag=%s)",
            socket_path,
            self._mode,
            "RW" if self._is_rw else "RO",
            self._tag,
        )
        return True

    @property
    def is_rw(self) -> Optional[bool]:
        """True if this session holds the RW lock; None before connect()."""
        return self._is_rw

    def has_committed_weights(self) -> bool:
        """Return True when the service granted RO, i.e. a writer already committed.

        False when not connected; import/attribute errors are swallowed so this
        stays a safe capability probe.
        """
        if self._client is None:
            return False
        try:
            from gpu_memory_service.common.locks import GrantedLockType

            return self._client.granted_lock_type == GrantedLockType.RO
        except Exception:
            return False

    @contextmanager
    def mem_pool_scope(
        self,
        device: Optional[torch.device] = None,
    ) -> Iterator[None]:
        """Context manager routing CUDA allocations into the GMS memory pool.

        Only valid while the session still holds the RW lock (writing weights).

        Raises:
            RuntimeError: if not connected, or if the session is RO.
        """
        if self._client is None:
            raise RuntimeError("GMS client not connected. Call connect() first.")
        if self._is_rw is False:
            raise RuntimeError(
                "GMS mem_pool_scope() is only valid in RW mode (this client was granted RO)."
            )

        from gpu_memory_service.client.torch.allocator import gms_use_mem_pool

        target_device = device
        if target_device is None:
            target_device = torch.device("cuda", self._device_index)

        with gms_use_mem_pool(self._tag, target_device):
            yield

    def move_untracked_params(self, model: nn.Module) -> None:
        """Copy CUDA parameters that were allocated outside GMS into GMS mappings.

        Parameters whose data pointer already falls inside a GMS mapping are
        left alone; others get a fresh GMS mapping, a copy of their data, and
        their ``.data`` rebound to the GMS-backed tensor.

        NOTE(review): dedup is by storage pointer, so if two parameters share
        one storage only the first encountered is migrated — verify this is
        the intended aliasing behavior.

        Raises:
            RuntimeError: if not connected.
        """
        if self._client is None:
            raise RuntimeError("GMS client not connected. Call connect() first.")

        from gpu_memory_service.client.torch.module import _iter_module_tensors
        from gpu_memory_service.client.torch.tensor import _tensor_from_pointer

        gms_client = self._client
        # Storage base pointers already visited, to avoid re-migrating aliases.
        seen: set[int] = set()

        with torch.no_grad():
            for _name, tensor, tensor_type in _iter_module_tensors(model):
                # Only CUDA parameters are migrated; buffers/CPU tensors skipped.
                if tensor_type != "parameter" or tensor is None or not tensor.is_cuda:
                    continue

                storage_ptr = tensor.untyped_storage().data_ptr()
                if storage_ptr in seen:
                    continue
                seen.add(storage_ptr)

                # Already inside a GMS mapping — nothing to do.
                if _ptr_in_gms(gms_client, int(tensor.data_ptr())):
                    continue

                # Allocate a same-sized mapping in GMS and view it with the
                # original shape/stride/dtype, then copy and rebind.
                nbytes = _storage_nbytes(tensor)
                base_va = gms_client.create_mapping(size=nbytes, tag=self._tag)
                replacement = _tensor_from_pointer(
                    int(base_va),
                    list(tensor.shape),
                    list(tensor.stride()),
                    tensor.dtype,
                    self._device_index,
                )
                replacement.copy_(tensor)
                tensor.data = replacement

    def finalize_write(self, model: nn.Module) -> int:
        """Register *model*'s tensors with GMS and commit, downgrading RW -> RO.

        Returns:
            Total bytes committed to the service.

        Raises:
            RuntimeError: if not connected or the session is not RW.
        """
        if self._client is None:
            raise RuntimeError("GMS client not connected. Call connect() first.")
        if self._is_rw is False:
            raise RuntimeError("GMS finalize_write() is only valid in RW mode.")

        from gpu_memory_service.client.torch.module import register_module_tensors
        from gpu_memory_service.integrations.common.utils import finalize_gms_write

        register_module_tensors(self._client, model)
        bytes_committed = int(self._client.total_bytes)
        # Ensure all pending weight copies landed before committing.
        torch.cuda.synchronize()
        finalize_gms_write(self._client)
        # After commit this session behaves as a reader.
        self._is_rw = False
        logger.info(
            "GMS RW->RO: committed %.2f GiB at %s (tag=%s)",
            bytes_committed / (1 << 30),
            self._socket_path,
            self._tag,
        )
        return bytes_committed

    def materialize_module(self, model: nn.Module) -> None:
        """Bind *model*'s parameters to weights previously committed in GMS.

        Also marks every ``Linear`` submodule as presharded, since the
        committed weights were already TP-sharded by the original writer.

        Raises:
            RuntimeError: if not connected.
        """
        if self._client is None:
            raise RuntimeError("GMS client not connected. Call connect() first.")

        from gpu_memory_service.client.torch.module import materialize_module_from_gms
        from tensorrt_llm._torch.modules.linear import Linear

        materialize_module_from_gms(
            self._client,
            model,
            device_index=self._device_index,
        )

        # Tell Linear.load_weights* helpers to skip TP re-sharding.
        for module in model.modules():
            if isinstance(module, Linear):
                module._weights_presharded = True

        logger.info(
            "GMS RO: materialized weights from %s (tag=%s, tp_rank=%d/%d, total_bytes=%.2f GiB)",
            self._socket_path,
            self._tag,
            self._mapping.tp_rank,
            self._mapping.tp_size,
            int(self._client.total_bytes) / (1 << 30),
        )

    def cleanup(self) -> None:
        """Disconnect from GMS, tolerating errors; idempotent when already detached."""
        if self._client is None:
            return

        try:
            from gpu_memory_service.client.torch.allocator import (
                evict_gms_client_memory_manager,
            )

            client = self._client
            # Close first, then evict from the process-wide manager cache;
            # a failed close must not prevent eviction.
            try:
                client.close()
            except Exception:
                pass
            evict_gms_client_memory_manager(client)
            logger.info("GMS: disconnected from %s", self._socket_path)
        except Exception as e:
            logger.warning("GMS cleanup error: %s", e)
        finally:
            # Always drop the reference so a later cleanup() is a no-op.
            self._client = None
261+
262+
263+
def _ptr_in_gms(gms_client, ptr: int) -> bool:
264+
mappings = getattr(gms_client, "mappings", None)
265+
if not mappings:
266+
mappings = getattr(gms_client, "_mappings", None)
267+
if not mappings:
268+
return False
269+
270+
for mapping in mappings.values():
271+
base = int(getattr(mapping, "va", 0))
272+
size = int(getattr(mapping, "aligned_size", getattr(mapping, "size", 0)))
273+
if base and size and base <= ptr < base + size:
274+
return True
275+
return False
276+
277+
278+
def _storage_nbytes(tensor: torch.Tensor) -> int:
279+
return int(tensor.untyped_storage().nbytes())

tensorrt_llm/_torch/modules/linear.py

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,11 @@ def load_weights_vanilla_helper(module: Linear,
183183
if module.bias is not None:
184184
assert "bias" in weights[0]
185185
device = torch.device('cuda')
186+
tp_size = 1 if getattr(module, '_weights_presharded', False) else module.tp_size
187+
tp_rank = 0 if getattr(module, '_weights_presharded', False) else module.tp_rank
186188

187-
weight = load_weight_shard(weights[0]['weight'], module.tp_size,
188-
module.tp_rank, module.tp_mode,
189+
weight = load_weight_shard(weights[0]['weight'], tp_size,
190+
tp_rank, module.tp_mode,
189191
device) if "weight" in weights[0] else None
190192

191193
if weight is not None:
@@ -201,8 +203,8 @@ def load_weights_vanilla_helper(module: Linear,
201203
copy_weight(module.weight, weight_transform(weight))
202204

203205
if module.bias is not None:
204-
bias = load_weight_shard(weights[0]['bias'], module.tp_size,
205-
module.tp_rank, module.tp_mode,
206+
bias = load_weight_shard(weights[0]['bias'], tp_size,
207+
tp_rank, module.tp_mode,
206208
device) if "bias" in weights[0] else None
207209
if bias is not None:
208210
copy_weight(module.bias, bias_transform(bias))
@@ -224,26 +226,28 @@ def load_weights_fused_qkv_helper(
224226
module, "fused_weight_shard_indices_mapping", None
225227
) is not None, "Fused weight shard indices mapping is required in partial loading"
226228
device = torch.device('cuda')
229+
tp_size = 1 if getattr(module, '_weights_presharded', False) else module.tp_size
230+
tp_rank = 0 if getattr(module, '_weights_presharded', False) else module.tp_rank
227231

228-
q_weight = load_weight_shard(weights[0]['weight'], module.tp_size,
229-
module.tp_rank, module.tp_mode,
232+
q_weight = load_weight_shard(weights[0]['weight'], tp_size,
233+
tp_rank, module.tp_mode,
230234
device) if "weight" in weights[0] else None
231-
k_weight = load_weight_shard(weights[1]['weight'], module.tp_size,
232-
module.tp_rank, module.tp_mode,
235+
k_weight = load_weight_shard(weights[1]['weight'], tp_size,
236+
tp_rank, module.tp_mode,
233237
device) if "weight" in weights[1] else None
234-
v_weight = load_weight_shard(weights[2]['weight'], module.tp_size,
235-
module.tp_rank, module.tp_mode,
238+
v_weight = load_weight_shard(weights[2]['weight'], tp_size,
239+
tp_rank, module.tp_mode,
236240
device) if "weight" in weights[2] else None
237241

238242
if module.bias is not None:
239-
q_bias = load_weight_shard(weights[0]['bias'], module.tp_size,
240-
module.tp_rank, module.tp_mode,
243+
q_bias = load_weight_shard(weights[0]['bias'], tp_size,
244+
tp_rank, module.tp_mode,
241245
device) if "bias" in weights[0] else None
242-
k_bias = load_weight_shard(weights[1]['bias'], module.tp_size,
243-
module.tp_rank, module.tp_mode,
246+
k_bias = load_weight_shard(weights[1]['bias'], tp_size,
247+
tp_rank, module.tp_mode,
244248
device) if "bias" in weights[1] else None
245-
v_bias = load_weight_shard(weights[2]['bias'], module.tp_size,
246-
module.tp_rank, module.tp_mode,
249+
v_bias = load_weight_shard(weights[2]['bias'], tp_size,
250+
tp_rank, module.tp_mode,
247251
device) if "bias" in weights[2] else None
248252
if not allow_partial_loading:
249253
copy_weight(module.bias,
@@ -277,19 +281,21 @@ def load_weights_fused_gate_up_helper(
277281
module, "fused_weight_shard_indices_mapping", None
278282
) is not None, "Fused weight shard indices mapping is required in partial loading"
279283
device = torch.device('cuda')
284+
tp_size = 1 if getattr(module, '_weights_presharded', False) else module.tp_size
285+
tp_rank = 0 if getattr(module, '_weights_presharded', False) else module.tp_rank
280286

281-
gate_weight = load_weight_shard(weights[0]['weight'], module.tp_size,
282-
module.tp_rank, module.tp_mode,
287+
gate_weight = load_weight_shard(weights[0]['weight'], tp_size,
288+
tp_rank, module.tp_mode,
283289
device) if "weight" in weights[0] else None
284-
up_weight = load_weight_shard(weights[1]['weight'], module.tp_size,
285-
module.tp_rank, module.tp_mode,
290+
up_weight = load_weight_shard(weights[1]['weight'], tp_size,
291+
tp_rank, module.tp_mode,
286292
device) if "weight" in weights[1] else None
287293
if module.bias is not None:
288-
gate_bias = load_weight_shard(weights[0]['bias'], module.tp_size,
289-
module.tp_rank, module.tp_mode,
294+
gate_bias = load_weight_shard(weights[0]['bias'], tp_size,
295+
tp_rank, module.tp_mode,
290296
device) if "bias" in weights[0] else None
291-
up_bias = load_weight_shard(weights[1]['bias'], module.tp_size,
292-
module.tp_rank, module.tp_mode,
297+
up_bias = load_weight_shard(weights[1]['bias'], tp_size,
298+
tp_rank, module.tp_mode,
293299
device) if "bias" in weights[1] else None
294300
if not allow_partial_loading:
295301
copy_weight(module.bias,
@@ -2502,6 +2508,7 @@ def __init__(
25022508
self.use_cute_dsl_blockscaling_mm = use_cute_dsl_blockscaling_mm
25032509
self.disable_deep_gemm = disable_deep_gemm
25042510
self.fused_weight_shard_indices_mapping = fused_weight_shard_indices_mapping
2511+
self._weights_presharded = False
25052512

25062513
# Store NVFP4 GEMM allowed backends configuration
25072514
# Read from model_extra_attrs if not explicitly provided (allows config via llm_api_options)

0 commit comments

Comments
 (0)