Merge remote-tracking branch 'upstream/main' into main

kip-cxj · kip-cxj · commit 6ad7671bc44f · 2026-01-29T19:49:52.000+08:00
diff --git a/checkpoint_engine/distributed/base.py b/checkpoint_engine/distributed/base.py
@@ -103,23 +103,21 @@ def new_group(
 class TorchBackend(Distributed):
     def init_process_group(
         self,
-        host: str,
-        port: int,
         rank: int,
         world_size: int,
+        store: torch.distributed.TCPStore,
         timeout: timedelta,
         **kwargs,
     ):
         backend = kwargs.get("backend", "nccl")
-        store = torch.distributed.TCPStore(
-            host, port, world_size, timeout=timeout, is_master=(rank == 0)
-        )
+        store_counter = kwargs.get("store_counter", "nccl")
+        sub_store = torch.distributed.PrefixStore(f"prefix-{store_counter}", store)
         torch.distributed.init_process_group(
             backend=backend,
             world_size=world_size,
             rank=rank,
             timeout=timeout,
-            store=store,
+            store=sub_store,
         )
 
     def destroy_process_group(self, group: DistributedProcessGroup | None = None):
@@ -243,14 +241,13 @@ def use_backend(backend: str | None):
 
 
 def init_process_group(
-    host: str,
-    port: int,
     rank: int,
     world_size: int,
+    store: torch.distributed.TCPStore,
     timeout: timedelta = timedelta(seconds=300),
     **kwargs,
 ):
-    _BACKEND_INSTANCE.init_process_group(host, port, rank, world_size, timeout, **kwargs)
+    _BACKEND_INSTANCE.init_process_group(rank, world_size, store, timeout, **kwargs)
 
 
 def destroy_process_group(group: DistributedProcessGroup | None = None):
diff --git a/checkpoint_engine/distributed/hccl.py b/checkpoint_engine/distributed/hccl.py
@@ -230,23 +230,22 @@ def _use_group(self, group: CommGroup | None, src: int | None = None):
 
     def init_process_group(
         self,
-        host: str,
-        port: int,
         rank: int,
         world_size: int,
+        store: torch.distributed.TCPStore,
         timeout: timedelta = timedelta(seconds=300),
         **kwargs,
     ):
         assert not self.initialized, "already initialized"
 
-        self.host = host
-        self.port = port
+        self.host = store.host
+        self.port = store.port + 1
         self.rank = rank
         self.world_size = world_size
         self.device = torch.device("npu", torch.npu.current_device())
 
         self.pg = StatelessProcessGroup.create(
-            host, port, rank, world_size, store_timeout=int(timeout.total_seconds())
+            self.host, self.port, rank, world_size, store_timeout=int(timeout.total_seconds())
         )
         self.pyhccl = PyHcclCommunicatorEx(group=self.pg, device=self.device)
         self.comm = self.pyhccl.comm
diff --git a/checkpoint_engine/distributed/nccl.py b/checkpoint_engine/distributed/nccl.py
@@ -133,23 +133,22 @@ def _use_group(self, group: CommGroup | None, src: int | None = None):
 
     def init_process_group(
         self,
-        host: str,
-        port: int,
         rank: int,
         world_size: int,
+        store: torch.distributed.TCPStore,
         timeout: timedelta = timedelta(seconds=300),
         **kwargs,
     ):
         assert not self.initialized, "already initialized"
 
-        self.host = host
-        self.port = port
+        self.host = store.host
+        self.port = store.port + 1
         self.rank = rank
         self.world_size = world_size
         self.device = torch.device("cuda", torch.cuda.current_device())
 
         self.pg = StatelessProcessGroup.create(
-            host, port, rank, world_size, store_timeout=int(timeout.total_seconds())
+            self.host, self.port, rank, world_size, store_timeout=int(timeout.total_seconds())
         )
 
         self.pynccl = PyNcclCommunicatorEx(group=self.pg, device=self.device)
diff --git a/checkpoint_engine/pin_memory.py b/checkpoint_engine/pin_memory.py
@@ -209,7 +209,9 @@ def _pin(t: torch.Tensor):
             torch.cuda.set_device(device_index)
             cudart = torch.cuda.cudart()
             r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
-            assert r == 0, f"pin memory error, error code: {r}"
+            if r != 0:
+                error_msg = cudart.cudaGetErrorString(r)
+                raise RuntimeError(f"pin memory error, error code: {r}, error message: {error_msg}")
 
         # TODO: should only support /dev/shm? but we found files in disk also work?
         size = os.stat(file_path).st_size
@@ -254,6 +256,12 @@ def _pin(t: torch.Tensor):
         # Remove the file after successfully loading. This will avoid doubling the memory usage.
         # We assume files in /dev/shm/ are temporary files. So it's safe to remove them after loading.
         os.remove(file_path)
+        if not metas:
+            # TODO: should we still return this buffer?
+            assert buffer.nbytes == 0, f"buffer nbytes {buffer.nbytes} should be 0"
+            logger.warning(f"[rank{rank}] no metas found in {file_path}, skip pin memory")
+            return MemoryBuffer(buffer=buffer, size=buffer.nbytes, metas=[], manually_pinned=False)
+
         _pin(buffer)
         logger.info(
             f"[rank{rank}] inplace pin memory for file {file_path} finished, size {buffer.nbytes / 1024 / 1024:.2f}MiB"
diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py
@@ -176,6 +176,8 @@ def __init__(
         auto_pg: bool = True,
         gpu_count: int | None = None,
         mem_fraction: float | None = None,
+        master_addr: str | None = None,
+        master_port: int | None = None,
     ):
         """
         Initialize the parameter server. env RANK, WORLD_SIZE and MASTER_ADDR must be set.
@@ -229,6 +231,17 @@ def __init__(
         self._device_uuid = _get_physical_gpu_id(self.device_manager, device_index)
         self._rdma_device = None if self._p2p_store is None else self._p2p_store.device
 
+        master_addr = master_addr or os.getenv("MASTER_ADDR")
+        assert master_addr, "master_addr is required"
+        self._store = torch.distributed.TCPStore(
+            master_addr,
+            _get_master_port(master_port),
+            self._world_size,
+            timeout=timedelta(minutes=10),
+            is_master=self._rank == 0,
+        )
+        self._store_counter = 0
+
     def _get_memory_pool(self, checkpoint_name: str) -> list[MemoryBuffer]:
         if checkpoint_name == self._current_shared_memory_pool_user:
             assert self._memory_pool[self.shared_memory_pool_name], (
@@ -392,7 +405,11 @@ def _unpin(t: torch.Tensor):
                 )
                 cudart = torch.cuda.cudart()
                 r = cudart.cudaHostUnregister(t.data_ptr())
-                assert r == 0, f"unpin memory error, error code: {r}"
+                if r != 0:
+                    error_msg = cudart.cudaGetErrorString(r)
+                    raise RuntimeError(
+                        f"unpin memory error, error code: {r}, error message: {error_msg}"
+                    )
 
             # if the checkpoint is pinned by cudaHostRegister manually, we need to unpin it manually
             try:
@@ -408,7 +425,13 @@ def _unpin(t: torch.Tensor):
             del self._memory_pool[checkpoint_name]
         # see https://github.com/pytorch/pytorch/blob/31d5c675394705f8a6bc767f80ae14bf4f01246b/torch/csrc/cuda/Module.cpp#L2018
         # this works by using torch>=2.5.0
-        torch._C._host_emptyCache()
+        if self.device_manager.device_type == "cuda":
+            torch._C._host_emptyCache()
+        else:
+            # torch._C._host_emptyCache() is not supported on NPU, so we call gc.collect() to empty host cache.
+            import gc
+
+            gc.collect()
 
     def gather_metas(self, checkpoint_name: str):
         """
@@ -478,8 +501,6 @@ def gather_metas(self, checkpoint_name: str):
     def init_process_group(
         self,
         *,
-        master_addr: str | None = None,
-        master_port: int | None = None,
         timeout: timedelta = timedelta(minutes=10),
     ):
         """
@@ -489,21 +510,18 @@ def init_process_group(
             master_port: The specified port of the master node. If not set, will use _get_master_port to get the port.
             timeout: The timeout of the process group.
         """
-        master_addr = master_addr or os.getenv("MASTER_ADDR")
-        assert master_addr, "master_addr is required"
+        self._store_counter += 1
         dist.init_process_group(
-            host=master_addr,
-            port=_get_master_port(master_port),
             rank=self._rank,
             world_size=self._world_size,
+            store=self._store,
             timeout=timeout,
             backend=self.device_manager.backend,
+            store_counter=self._store_counter,
         )
         logger.info(f"[rank{self._rank}] init process group successfully.")
 
-    def store_based_barrier(
-        self, store: torch.distributed.TCPStore, timeout: timedelta = timedelta(minutes=5)
-    ) -> None:
+    def store_based_barrier(self, timeout: timedelta = timedelta(minutes=5)) -> None:
         """
         Perform a store-based barrier synchronization across all ranks.
 
@@ -516,7 +534,7 @@ def store_based_barrier(
         """
         torch.distributed.distributed_c10d._store_based_barrier(
             rank=self._rank,
-            store=store,
+            store=self._store,
             group_name="parameter_server_barrier",
             rendezvous_count=self._world_size,
             timeout=timeout,
@@ -529,8 +547,6 @@ def update(
         *,
         timeout: timedelta = timedelta(minutes=10),
         ranks: list[int] | None = None,
-        master_addr: str | None = None,
-        master_port: int | None = None,
     ) -> None:
         """
         Update the checkpoint to inference engine. This function should be called after gather_metas.
@@ -551,25 +567,12 @@ def update(
         assert req_func is not None, "req_func is required"
         ranks_group = None
         try:
-            master_addr = os.getenv("MASTER_ADDR") or master_addr
-            assert master_addr, "master_addr is required"
             if self._auto_pg and not dist.is_initialized():
-                self.init_process_group(
-                    timeout=timeout, master_addr=master_addr, master_port=master_port
-                )
-            # HACK: MASTER_PORT+2 for barrier store if master_port is not provided, _get_master_port() returns MASTER_PORT+1
-            # If master_port is provided, use master_port+1 for barrier store
-            manager_store = torch.distributed.TCPStore(
-                master_addr,
-                _get_master_port(master_port) + 1,
-                self._world_size,
-                timeout=timeout,
-                is_master=self._rank == 0,
-            )
+                self.init_process_group(timeout=timeout)
             # if ranks is None or [], it will use fully broadcast to update to all ranks
             ranks_group = dist.new_group(ranks) if ranks else None
             self._update_per_bucket(checkpoint_name, req_func, ranks_group, ranks)
-            self.store_based_barrier(manager_store)
+            self.store_based_barrier()
         except Exception as e:
             logger.exception(
                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} error {e}"
@@ -580,7 +583,6 @@ def update(
                 dist.destroy_process_group(ranks_group)
             if self._auto_pg and dist.is_initialized():
                 dist.destroy_process_group()
-            del manager_store
             self.device_manager.device_module.empty_cache()
             logger.info(
                 f"[rank{self._rank}] update checkpoint {checkpoint_name} with ranks {ranks} done. "
diff --git a/checkpoint_engine/worker.py b/checkpoint_engine/worker.py
@@ -10,6 +10,9 @@
 from checkpoint_engine.device_utils import DeviceManager, npu_generate_uuid
 
 
+_WEIGHTS_TYPE = list[tuple[str, torch.Tensor]]
+
+
 def _rebuild_ipc(handle: tuple[Callable, tuple], device_id: int | None = None) -> torch.Tensor:
     func, args = handle
     list_args = list(args)
@@ -29,11 +32,9 @@ class FlattenedTensorMetadata(TypedDict):
     offset: int
 
 
-def _extract_weights(
-    payload: list[FlattenedTensorMetadata], buffer: torch.Tensor
-) -> list[tuple[str, torch.Tensor]]:
+def _extract_weights(payload: list[FlattenedTensorMetadata], buffer: torch.Tensor) -> _WEIGHTS_TYPE:
     assert buffer is not None
-    weights: list[tuple[str, torch.Tensor]] = []
+    weights: _WEIGHTS_TYPE = []
     for item in payload:
         shape = item["shape"]
         if isinstance(shape, list | tuple):
@@ -166,12 +167,31 @@ def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
             self.device = torch.device(f"npu:{self.local_rank}")
         assert self.device is not None
 
+        def _load_weights(weights: _WEIGHTS_TYPE):
+            # Load main model weights
+            self.model_runner.model.load_weights(weights)
+            # Load drafter model weights if MTP/speculative decoding is enabled
+            if (
+                getattr(self.model_runner, "drafter", None) is not None
+                and getattr(self.model_runner.drafter, "model", None) is not None
+            ):
+                self.model_runner.drafter.model.load_weights(weights=weights)
+
+        def _post_hook():
+            process_weights_after_loading(self.model_runner.model, self.model_config, self.device)
+            # Also trigger drafter model's post processing if MTP is enabled
+            if (
+                getattr(self.model_runner, "drafter", None) is not None
+                and getattr(self.model_runner.drafter, "model", None) is not None
+            ):
+                process_weights_after_loading(
+                    self.model_runner.drafter.model, self.model_config, self.device
+                )
+
         update_weights_from_ipc(
             self._zmq_ctx,
             zmq_handles[self._device_uuid],
             device_id=self.device.index,
-            run=self.model_runner.model.load_weights,
-            post_hook=lambda: process_weights_after_loading(
-                self.model_runner.model, self.model_config, self.device
-            ),
+            run=_load_weights,
+            post_hook=_post_hook,
         )
diff --git a/tests/test_reuse_pin_memory.py b/tests/test_reuse_pin_memory.py
@@ -23,6 +23,8 @@ def generate_dummy_checkpoint() -> dict[str, torch.Tensor]:
 def test_register_pin_memory():
     os.environ["RANK"] = "0"
     os.environ["WORLD_SIZE"] = "1"
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "25400"
     ps = ParameterServer()
     checkpoint1 = generate_dummy_checkpoint()
     checkpoint_shared1 = generate_dummy_checkpoint()