remove dist_wrapper.py

yexin · yexin · commit b93e062e28ff · 2026-01-13T10:49:18.000+08:00
diff --git a/checkpoint_engine/dist_wrapper.py b/checkpoint_engine/dist_wrapper.py
diff --git a/checkpoint_engine/distributed/base.py b/checkpoint_engine/distributed/base.py
@@ -6,7 +6,7 @@
 import importlib
 
 import torch
-from torch.distributed import ReduceOp
+import torch.distributed as torch_dist
 
 
 class Distributed(ABC):
@@ -45,7 +45,7 @@ def all_gather_object(
     def all_reduce(
         self,
         tensor: torch.Tensor,
-        op :ReduceOp,
+        op :torch_dist.ReduceOp,
         group,
     ):
         raise NotImplementedError
@@ -159,13 +159,14 @@ def init_process_group(
 
 def destroy_process_group(group=None):
     if _BACKEND_INSTANCE is None:
-        raise RuntimeError("distribute module not initialized")
+        torch_dist.destroy_process_group(group)
+        return
     _BACKEND_INSTANCE.destroy_process_group(group)
 
 
 def is_initialized() -> bool:
     if _BACKEND_INSTANCE is None:
-        return False
+        return torch_dist.is_initialized()
     return _BACKEND_INSTANCE.is_initialized()
 
 def all_gather_object(
@@ -174,37 +175,43 @@ def all_gather_object(
     group=None,
 ):
     if _BACKEND_INSTANCE is None:
-        raise RuntimeError("distribute module not initialized")
+        torch_dist.all_gather_object(object_list, obj, group)
+        return
     _BACKEND_INSTANCE.all_gather_object(object_list, obj, group)
 
 
 def all_reduce(
     tensor: torch.Tensor,
-    op=ReduceOp.SUM,
+    op=torch_dist.ReduceOp.SUM,
     group=None,
+    **kwargs,
 ):
     if _BACKEND_INSTANCE is None:
-        raise RuntimeError("distribute module not initialized")
+        # Hand back torch's result so an async_op=True Work handle is not lost.
+        return torch_dist.all_reduce(tensor, op, group, **kwargs)
     _BACKEND_INSTANCE.all_reduce(tensor, op, group)
 
 
 def broadcast(
     tensor: torch.Tensor,
-    src= None,
+    src=None,
     group=None,
+    **kwargs,
 ):
     if _BACKEND_INSTANCE is None:
-        raise RuntimeError("distribute module not initialized")
+        # Hand back torch's result so an async_op=True Work handle is not lost.
+        return torch_dist.broadcast(tensor, src, group, **kwargs)
     _BACKEND_INSTANCE.broadcast(tensor, src, group)
 
 
-def barrier(group=None):
+def barrier(group=None, **kwargs):
     if _BACKEND_INSTANCE is None:
-        raise RuntimeError("distribute module not initialized")
+        # Hand back torch's result so an async_op=True Work handle is not lost.
+        return torch_dist.barrier(group, **kwargs)
     _BACKEND_INSTANCE.barrier(group)
 
 
-def new_group(ranks: list[int]):
+def new_group(ranks: list[int], **kwargs):
     if _BACKEND_INSTANCE is None:
-        raise RuntimeError("distribute module not initialized")
+        return torch_dist.new_group(ranks, **kwargs)
     return _BACKEND_INSTANCE.new_group(ranks)
diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py
@@ -24,7 +24,7 @@
 from checkpoint_engine.device_utils import DeviceManager, get_ip, npu_generate_uuid
 from checkpoint_engine.p2p_store import P2PStore
 from checkpoint_engine.pin_memory import _ALIGN_SIZE, _register_checkpoint
-from checkpoint_engine.dist_wrapper import dist
+import checkpoint_engine.distributed as dist
 
 
 if TYPE_CHECKING:
@@ -176,6 +176,7 @@ def __init__(
         auto_pg: bool = True,
         gpu_count: int | None = None,
         mem_fraction: float | None = None,
+        custom_dist: bool = False,
     ):
         """
         Initialize the parameter server. env RANK, WORLD_SIZE and MASTER_ADDR must be set.
@@ -196,6 +197,7 @@ def __init__(
         self._local_rdma_devices: dict[str, set[int]] = defaultdict(set)
         self._remote_rdma_devices: dict[str, set[int]] = defaultdict(set)
         self._mem_fraction = mem_fraction or float(os.getenv("PS_MEM_FRACTION", "0.9"))
+        self._custom_dist = custom_dist
 
         assert self._rank is not None and self._rank >= 0, self._rank
         assert self._world_size and self._world_size > 0, self._world_size
@@ -491,7 +493,7 @@ def init_process_group(
         """
         master_addr = master_addr or os.getenv("MASTER_ADDR")
         assert master_addr, "master_addr is required"
-        if dist is torch.distributed:
+        if not self._custom_dist:
             store = torch.distributed.TCPStore(
                 master_addr,
                 _get_master_port(master_port),
@@ -518,7 +520,7 @@ def init_process_group(
         logger.info(f"[rank{self._rank}] init process group successfully.")
 
     def store_based_barrier(
-        self, store: dist.TCPStore, timeout: timedelta = timedelta(minutes=5)
+        self, store, timeout: timedelta = timedelta(minutes=5)
     ) -> None:
         """
         Perform a store-based barrier synchronization across all ranks.
@@ -606,7 +608,7 @@ def zmq_handle(device_uuid: str) -> str:
         return socket, socket_paths
 
     def _detect_bucket_size(
-        self, ranks_group: dist.ProcessGroup | None, *, disable_h2d_buffer: bool = False
+        self, ranks_group, *, disable_h2d_buffer: bool = False
     ) -> tuple[int, bool]:
         GiB = 1 << 30  # noqa: N806
         # auto detect bucket size
@@ -725,7 +727,7 @@ def _update_per_bucket(
         self,
         checkpoint_name: str,
         req_func: Callable[[list[tuple[str, str]]], None],
-        ranks_group: dist.ProcessGroup | None,
+        ranks_group,
         ranks: list[int] | None = None,
     ):
         assert len(self._current_global_parameter_metas) != 0, "parameter metas is empty"
diff --git a/examples/update.py b/examples/update.py
@@ -15,7 +15,7 @@
 
 from checkpoint_engine.ps import ParameterServer
 from checkpoint_engine.api import request_inference_to_update
-from checkpoint_engine.dist_wrapper import dist, setup_dist
+import checkpoint_engine.distributed as dist
 
 
 @contextmanager
@@ -164,11 +164,8 @@ def join(
     rank = int(os.getenv("RANK"))
     world_size = int(os.getenv("WORLD_SIZE"))
 
-    if args.custom_dist:
-        setup_dist()
-
     req_func = req_inference(args.endpoint, args.inference_parallel_size, args.uds)
-    ps = ParameterServer(auto_pg=True)
+    ps = ParameterServer(auto_pg=True, custom_dist=args.custom_dist)
     if args.load_metas_file:
         join(
             ps,