MoonshotAI
diff --git a/checkpoint_engine/distributed/base.py b/checkpoint_engine/distributed/base.py
Lines changed: 27 additions & 17 deletions
diff --git a/checkpoint_engine/distributed/hccl.py b/checkpoint_engine/distributed/hccl.py
Lines changed: 33 additions & 15 deletions
@@ -7,6 +7,8 @@
 
 import torch
 import torch.distributed as torch_dist
+from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm_ascend.distributed.device_communicators.pyhccl import PyHcclCommunicator
 
 
 class Distributed(ABC):
@@ -24,7 +26,7 @@ def init_process_group(
     @abstractmethod
     def destroy_process_group(
         self,
-        group,
+        group: torch_dist.ProcessGroup | int | None = None,
     ):
+        """Tear down a process group; concrete backends must override this.
+
+        Args:
+            group: Handle of the group to destroy. Defaults to ``None``;
+                presumably that selects the backend's default (world) group
+                -- TODO confirm against the concrete implementations.
+
+        Raises:
+            NotImplementedError: Always, in this abstract base.
+        """
         raise NotImplementedError
 
@@ -37,7 +39,7 @@ def all_gather_object(
         self,
         object_list: list[Any],
         obj: Any,
-        group,
+        group: torch_dist.ProcessGroup | int | None = None,
     ):
         raise NotImplementedError
 
@@ -46,7 +48,7 @@ def all_reduce(
         self,
         tensor: torch.Tensor,
         op: torch_dist.ReduceOp,
-        group,
+        group: torch_dist.ProcessGroup | int | None = None,
     ):
         raise NotImplementedError
 
@@ -55,14 +57,14 @@ def broadcast(
         self,
         tensor: torch.Tensor,
         src: int,
-        group,
+        group: torch_dist.ProcessGroup | int | None = None,
     ):
         raise NotImplementedError
 
     @abstractmethod
     def barrier(
         self,
-        group,
+        group: torch_dist.ProcessGroup | int | None = None,
     ):
+        """Synchronization point; concrete backends must override this.
+
+        Args:
+            group: Handle of the group to synchronize. Defaults to ``None``;
+                presumably that selects the backend's default (world) group
+                -- TODO confirm against the concrete implementations.
+
+        Raises:
+            NotImplementedError: Always, in this abstract base.
+        """
         raise NotImplementedError
 
@@ -81,7 +83,7 @@ def new_group(
 _unpickler = pickle.Unpickler
 
 
-def _object_to_tensor(obj, device):
+def _object_to_tensor(obj: Any, device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
     f = io.BytesIO()
     _pickler(f).dump(obj)
     byte_storage = torch.ByteStorage._from_buffer(f.getvalue())
@@ -90,13 +92,15 @@ def _object_to_tensor(obj, device):
     return byte_tensor, local_size
 
 
-def _tensor_to_object(tensor, tensor_size):
+def _tensor_to_object(tensor: torch.Tensor, tensor_size: int) -> Any:
+    """Rebuild a Python object from the pickled bytes held in ``tensor``.
+
+    Args:
+        tensor: Tensor whose raw bytes contain a pickled payload; it is
+            copied to the CPU before being read.
+        tensor_size: Number of leading bytes that make up the payload;
+            anything after that is discarded before unpickling.
+
+    Returns:
+        The object produced by the module-level ``_unpickler``.
+
+    NOTE(review): unpickling can execute arbitrary code, so the bytes must
+    come from a trusted source -- confirm every caller only feeds data
+    received from trusted peers.
+    """
     tensor = tensor.cpu()
     buf = tensor.numpy().tobytes()[:tensor_size]
     return _unpickler(io.BytesIO(buf)).load()
 
 
-def _flatten_for_scatter_gather(tensor_list, copy=False):
+def _flatten_for_scatter_gather(
+    tensor_list: list[torch.Tensor], copy: bool = False
+) -> torch.Tensor:
     if not tensor_list:
         raise RuntimeError("Received an empty list.")
     t = tensor_list[0]
@@ -109,7 +113,13 @@ def _flatten_for_scatter_gather(tensor_list, copy=False):
     return buffer
 
 
-def _common_all_gather_object(comm, device, world_size, object_list, object):
+def _common_all_gather_object(
+    comm: PyNcclCommunicator | PyHcclCommunicator | Any,
+    device: torch.device,
+    world_size: int,
+    object_list: list[Any],
+    object: Any,
+):
     input_tensor, local_size = _object_to_tensor(object, device)
     object_sizes_tensor = torch.empty(world_size, dtype=torch.long, device=device)
     comm.all_gather(object_sizes_tensor, local_size)
@@ -157,7 +167,7 @@ def init_process_group(
     _BACKEND_INSTANCE.init_process_group(host, port, rank, world_size, timeout)
 
 
-def destroy_process_group(group=None):
+def destroy_process_group(group: torch_dist.ProcessGroup | int | None = None):
     if _BACKEND_INSTANCE is None:
         torch_dist.destroy_process_group(group)
         return
@@ -173,7 +183,7 @@ def is_initialized() -> bool:
 def all_gather_object(
     object_list: list[Any],
     obj: Any,
-    group=None,
+    group: torch_dist.ProcessGroup | int | None = None,
 ):
     if _BACKEND_INSTANCE is None:
         torch_dist.all_gather_object(object_list, obj, group)
@@ -183,8 +193,8 @@ def all_gather_object(
 
 def all_reduce(
     tensor: torch.Tensor,
-    op=torch_dist.ReduceOp.SUM,
-    group=None,
+    op: torch_dist.ReduceOp = torch_dist.ReduceOp.SUM,
+    group: torch_dist.ProcessGroup | int | None = None,
     **kwargs,
 ):
     if _BACKEND_INSTANCE is None:
@@ -195,8 +205,8 @@ def all_reduce(
 
 def broadcast(
     tensor: torch.Tensor,
-    src=None,
-    group=None,
+    src: int = 0,
+    group: torch_dist.ProcessGroup | int | None = None,
     **kwargs,
 ):
     if _BACKEND_INSTANCE is None:
@@ -205,14 +215,14 @@ def broadcast(
     _BACKEND_INSTANCE.broadcast(tensor, src, group)
 
 
-def barrier(group=None, **kwargs):
+def barrier(group: torch_dist.ProcessGroup | int | None = None, **kwargs):
+    """Run a barrier through the custom backend, or ``torch.distributed``.
+
+    Args:
+        group: Group handle passed through unchanged to whichever
+            implementation handles the call.
+        **kwargs: Forwarded to ``torch_dist.barrier`` only. When
+            ``_BACKEND_INSTANCE`` is set they are silently ignored.
+    """
     if _BACKEND_INSTANCE is None:
         torch_dist.barrier(group, **kwargs)
         return
     _BACKEND_INSTANCE.barrier(group)
 
 
-def new_group(ranks: list[int], **kwargs):
+def new_group(ranks: list[int], **kwargs) -> torch_dist.ProcessGroup | int | None:
+    """Create a group through the custom backend, or ``torch.distributed``.
+
+    Args:
+        ranks: Ranks that should belong to the new group.
+        **kwargs: Forwarded to ``torch_dist.new_group`` only. When
+            ``_BACKEND_INSTANCE`` is set they are silently ignored.
+
+    Returns:
+        Whatever the selected implementation returns: the result of
+        ``torch_dist.new_group`` when no backend instance is set, otherwise
+        the result of ``_BACKEND_INSTANCE.new_group``.
+    """
     if _BACKEND_INSTANCE is None:
         return torch_dist.new_group(ranks, **kwargs)
     return _BACKEND_INSTANCE.new_group(ranks)
@@ -1,6 +1,6 @@
 import ctypes
 from datetime import timedelta
-from typing import Any
+from typing import Any, ClassVar
 
 import torch
 from torch.distributed import ReduceOp
@@ -22,7 +22,7 @@
 
 
 class HcclCommConfig(ctypes.Structure):
-    _fields_ = [
+    _fields_: ClassVar[list[tuple[str, Any]]] = [
         ("size", ctypes.c_size_t),
         ("magic_word", ctypes.c_uint32),
         ("version", ctypes.c_uint32),
@@ -81,15 +81,29 @@ class HcclCommConfig(ctypes.Structure):
 ]
 
 
-def hccl_all_gather(self, send_buf, recv_buf, count, data_type, comm, stream):
+def hccl_all_gather(
+    self,  # noqa: ANN001
+    send_buf: buffer_type,
+    recv_buf: buffer_type,
+    count: ctypes.c_uint64,
+    data_type: hcclDataType_t,
+    comm: hcclComm_t,
+    stream: aclrtStream_t,
+):
+    """Call the ``HcclAllGather`` entry and check its status.
+
+    The callable is looked up as ``self._funcs["HcclAllGather"]``, invoked
+    with the arguments in the order given, and its return value is handed
+    to ``self.HCCL_CHECK``.
+
+    NOTE(review): this is a module-level function that takes ``self``
+    explicitly; presumably it is attached to the HCCL library wrapper
+    class elsewhere in the file -- confirm.
+    """
     self.HCCL_CHECK(
         self._funcs["HcclAllGather"](send_buf, recv_buf, count, data_type, comm, stream)
     )
 
 
 def hccl_create_subcomm_config(
-    self, comm, ranks_size, c_rank_ids, subcomm_id, subcomm_rank, comm_config
-):
+    self,  # noqa: ANN001
+    comm: hcclComm_t,
+    ranks_size: ctypes.c_uint32,
+    c_rank_ids: ctypes.POINTER(ctypes.c_uint32),
+    subcomm_id: ctypes.c_uint64,
+    subcomm_rank: ctypes.c_uint64,
+    comm_config: HcclCommConfig,
+) -> hcclComm_t:
     subcomm = hcclComm_t()
     self.HCCL_CHECK(
         self._funcs["HcclCreateSubCommConfig"](
@@ -112,17 +126,19 @@ def hccl_create_subcomm_config(
 
 
 class PyHcclCommunicatorEx(PyHcclCommunicator):
-    def __init__(self, group, device):
+    def __init__(self, group: StatelessProcessGroup, device: torch.device):
+        """Initialize the base communicator and the sub-communicator counter.
+
+        Args:
+            group: Passed unchanged to ``PyHcclCommunicator.__init__``.
+            device: Passed unchanged to ``PyHcclCommunicator.__init__``.
+        """
         super().__init__(group, device)
+        # Starts at 1; presumably the id handed to the next sub-communicator
+        # that gets created -- TODO confirm in create_subcomm.
         self.subcomm_id = 1
 
-    def destroy_comm(self, comm=None):
+    def destroy_comm(self, comm: hcclComm_t | None = None) -> None:
+        """Destroy an HCCL communicator.
+
+        Args:
+            comm: Communicator to destroy. When it is ``None`` (or otherwise
+                falsy) this instance's own ``self.comm`` is destroyed
+                instead.
+        """
         if comm:
             self.hccl.hcclCommDestroy(comm)
         else:
             self.hccl.hcclCommDestroy(self.comm)
 
-    def all_gather(self, out_tensor: torch.Tensor, in_tensor: torch.Tensor, stream=None):
+    def all_gather(
+        self, out_tensor: torch.Tensor, in_tensor: torch.Tensor, stream: torch.npu.Stream = None
+    ) -> torch.Tensor:
         if self.disabled:
             return
         assert in_tensor.device == self.device, (
@@ -141,7 +157,7 @@ def all_gather(self, out_tensor: torch.Tensor, in_tensor: torch.Tensor, stream=N
         )
         return out_tensor
 
-    def create_subcomm(self, ranks):
+    def create_subcomm(self, ranks: list[int]) -> hcclComm_t:
         comm_config = HcclCommConfig(
             size=312,
             magic_word=0xF0F0F0F0,
@@ -214,7 +230,7 @@ def init_process_group(
 
     def destroy_process_group(
         self,
-        group=None,
+        group: int | None = None,
     ):
         assert self.initialized, "not initialized"
 
@@ -232,7 +248,7 @@ def destroy_process_group(
     def is_initialized(self) -> bool:
+        """Return the value of this backend's ``initialized`` flag."""
         return self.initialized
 
-    def all_gather_object(self, object_list: list[Any], obj: Any, group=None):
+    def all_gather_object(self, object_list: list[Any], obj: Any, group: int | None = None):
         assert self.initialized, "not initialized"
 
         if group:
@@ -246,7 +262,9 @@ def all_gather_object(self, object_list: list[Any], obj: Any, group=None):
         if group:
             self.pyhccl.comm = self.comm
 
-    def all_reduce(self, tensor: torch.Tensor, op=ReduceOp.SUM, group=None):
+    def all_reduce(
+        self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, group: int | None = None
+    ):
         assert self.initialized, "not initialized"
 
         if group:
@@ -261,7 +279,7 @@ def all_reduce(self, tensor: torch.Tensor, op=ReduceOp.SUM, group=None):
         if group:
             self.pyhccl.comm = self.comm
 
-    def broadcast(self, tensor: torch.Tensor, src=None, group=None):
+    def broadcast(self, tensor: torch.Tensor, src: int | None = None, group: int | None = None):
         assert self.initialized, "not initialized"
 
         if group:
@@ -280,7 +298,7 @@ def broadcast(self, tensor: torch.Tensor, src=None, group=None):
             self.pyhccl.comm = self.comm
             self.pyhccl.rank = self.rank
 
-    def barrier(self, group=None):
+    def barrier(self, group: int | None = None):
         assert self.initialized, "not initialized"
 
         if group:
@@ -295,7 +313,7 @@ def barrier(self, group=None):
         if group:
             self.pyhccl.comm = self.comm
 
-    def new_group(self, ranks):
+    def new_group(self, ranks: list[int]) -> int:
         assert self.initialized, "not initialized"
 
         # if ranks is None or [], using the world instead