Skip to content

Commit c7c2d8d

Browse files
author
cuixiaojin
committed
[modify] generate UUID from `npu-smi info` output
1 parent c9d3d42 commit c7c2d8d

File tree

3 files changed

+87
-49
lines changed

3 files changed

+87
-49
lines changed

checkpoint_engine/device_utils.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import os
2+
import re
3+
import socket
4+
import subprocess
5+
6+
import torch
7+
8+
9+
def npu_generate_uuid() -> str:
    """Return a host-unique identifier for the NPU chip running this process.

    Scans up to ``npu_num`` devices with ``npu-smi info -t proc-mem`` until the
    current PID appears in a device's process list, then combines the host IP
    with a flat chip index (``npu_id * chip_count + chip_id``).

    Returns:
        A string of the form ``"<server_ip>-<global_chip_index>"``.

    Raises:
        ValueError: if the current process is not found on any NPU device,
            if ``npu-smi`` exits with an error, or if its output cannot be
            parsed.
    """
    str_pid = str(os.getpid())
    # NOTE(review): assumes at most 8 NPUs per server -- confirm for other
    # hardware configurations.
    npu_num = 8
    try:
        for npu_id in range(npu_num):
            cmd = ["npu-smi", "info", "-t", "proc-mem", "-i", str(npu_id)]
            result = subprocess.run(cmd, check=True, capture_output=True, text=True)  # noqa: S603
            str_result = str(result.stdout)
            # NOTE(review): plain substring search can match a longer PID
            # (e.g. "123" inside "1234") -- consider a word-boundary regex.
            if str_pid not in str_result:
                continue
            # In A3 server, one NPU has two chips.
            match_chip_count = re.search(r"Chip Count[^\d]*(\d+)", str_result)
            # Take the Chip ID from the text *after* the PID so it belongs to
            # this process's entry rather than an earlier device/process row.
            search_after_pid = str_result[str_result.find(str_pid) + len(str_pid) :]
            match_chip_id = re.search(r"Chip ID[^\d]*(\d+)", search_after_pid)
            if match_chip_count is None or match_chip_id is None:
                # Guard against unexpected npu-smi output instead of crashing
                # with AttributeError on `.group()`.
                raise ValueError("Failed to parse npu-smi output for Chip Count/Chip ID")
            chip_count = int(match_chip_count.group(1))
            chip_id = int(match_chip_id.group(1))
            server_ip = socket.gethostbyname(socket.gethostname())
            return f"{server_ip}-{npu_id * chip_count + chip_id}"
        # BUG FIX: the original constructed ValueError without raising it, so
        # the function silently returned None despite the `-> str` annotation.
        raise ValueError("The current process is not running on the npu device")
    except subprocess.CalledProcessError as e:
        # Same fix here: raise instead of discarding the exception object,
        # chaining the npu-smi failure as the cause.
        raise ValueError("The current process is not running on the npu device") from e
29+
30+
31+
class DeviceManager:
    """Detect the available accelerator (Ascend NPU or CUDA GPU) and expose
    the matching torch device module and distributed backend name.

    Raises TypeError at construction time when neither accelerator is usable.
    """

    def __init__(self):
        # Detection happens once, at construction time.
        self.device_type = self._detect_device_type()
        self._setup_device_module()

    def _is_torch_npu_available(self) -> bool:
        """Return True when torch exposes a working ``npu`` accelerator."""
        try:
            npu_module = getattr(torch, "npu", None)
            if npu_module is None:
                return False
            checker = getattr(npu_module, "is_available", None)
            return checker() if callable(checker) else False
        except ImportError:
            return False

    def _detect_device_type(self) -> str:
        """Return ``"npu"`` or ``"cuda"``; raise TypeError when neither is present."""
        if self._is_torch_npu_available():
            return "npu"
        if torch.cuda.is_available():
            return "cuda"
        raise TypeError("The current device type is not supported")

    def _setup_device_module(self):
        """Bind ``self.device_module`` to the torch module for the detected device."""
        if self.device_type == "npu":
            import torch_npu

            self.device_module = torch_npu.npu
        elif self.device_type == "cuda":
            self.device_module = torch.cuda
        else:
            raise TypeError("The current device type is not supported")

    @property
    def backend(self) -> str:
        """Distributed backend name: ``"hccl"`` for NPU, ``"nccl"`` for CUDA."""
        return {"npu": "hccl", "cuda": "nccl"}.get(self.device_type)

checkpoint_engine/ps.py

Lines changed: 15 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -23,44 +23,7 @@
2323
from safetensors.torch import safe_open
2424
from torch.multiprocessing.reductions import reduce_tensor
2525

26-
27-
class DeviceManager:
28-
def __init__(self):
29-
self.device_type = self._detect_device_type()
30-
self._setup_device_module()
31-
32-
def _is_torch_npu_available(self) -> bool:
33-
try:
34-
if hasattr(torch, "npu") and callable(getattr(torch.npu, "is_available", None)):
35-
return torch.npu.is_available()
36-
else:
37-
return False
38-
except ImportError:
39-
return False
40-
41-
def _detect_device_type(self) -> str:
42-
if self._is_torch_npu_available():
43-
return "npu"
44-
elif torch.cuda.is_available():
45-
return "cuda"
46-
else:
47-
raise TypeError("The current device type is not supported")
48-
49-
def _setup_device_module(self):
50-
if self.device_type == "npu":
51-
import torch_npu
52-
self.device_module = torch_npu.npu
53-
elif self.device_type == "cuda":
54-
self.device_module = torch.cuda
55-
else:
56-
raise TypeError("The current device type is not supported")
57-
58-
@property
59-
def backend(self) -> str:
60-
if self.device_type == "npu":
61-
return "hccl"
62-
elif self.device_type == "cuda":
63-
return "nccl"
26+
from checkpoint_engine.device_utils import DeviceManager, npu_generate_uuid
6427

6528

6629
if TYPE_CHECKING:
@@ -288,10 +251,11 @@ def _concat_tp_weights(
288251
return torch.cat([w for w in tp_weights], dim=tp_concat_dim)
289252

290253

291-
def _get_physical_gpu_id(device_manager: DeviceManager, rank_id: int, device_index: int | None = None) -> str:
254+
def _get_physical_gpu_id(device_manager: DeviceManager, device_index: int | None = None) -> str:
292255
try:
293256
if device_manager.device_type == "npu":
294-
return f"NPU-{device_manager.device_module.get_device_properties(device_index).name!s}-{rank_id}"
257+
serial_number = npu_generate_uuid()
258+
return f"NPU-{serial_number}"
295259
else:
296260
return f"GPU-{device_manager.device_module.get_device_properties(device_index).uuid!s}"
297261
except AssertionError as e:
@@ -630,7 +594,7 @@ def _get_master_port(master_port: int | None = None) -> int:
630594

631595

632596
class P2PStore:
633-
def __init__(self, device_manager : DeviceManager):
597+
def __init__(self, device_manager: DeviceManager):
634598
from mooncake.engine import TransferEngine
635599

636600
self.rank = int(os.getenv("RANK"))
@@ -747,7 +711,7 @@ def __init__(
747711

748712
device_index = self._local_rank
749713
self.device_manager.device_module.set_device(device_index)
750-
self._device_uuid = _get_physical_gpu_id(self.device_manager, self._rank, device_index)
714+
self._device_uuid = _get_physical_gpu_id(self.device_manager, device_index)
751715

752716
def _logger_rank0(self, msg: str):
753717
if self._local_rank == 0:
@@ -961,7 +925,9 @@ def _detect_bucket_size(self, *, disable_h2d_buffer: bool = False) -> tuple[int,
961925
tensor = torch.tensor(
962926
[
963927
# proportion of current cuda free memory bytes
964-
int(float(self.device_manager.device_module.mem_get_info()[0]) * self._mem_fraction),
928+
int(
929+
float(self.device_manager.device_module.mem_get_info()[0]) * self._mem_fraction
930+
),
965931
# we use negative value to reuse allreduce min operation
966932
# for getting the max value of zmq_addr_counter in all ranks
967933
-self._zmq_addr_counter,
@@ -1100,7 +1066,9 @@ def _update_per_bucket_p2p(
11001066
dist.barrier()
11011067

11021068
bucket_size, _ = self._detect_bucket_size(disable_h2d_buffer=True)
1103-
buffer = torch.empty(bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type)
1069+
buffer = torch.empty(
1070+
bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type
1071+
)
11041072
ipc_buffer_name = "__ipc_buffer___"
11051073
self._p2p_store.register_named_tensors({ipc_buffer_name: buffer})
11061074
logger.info(
@@ -1190,7 +1158,9 @@ def _update_per_bucket(
11901158
continue
11911159
owner_rank_buckets.append(bucket)
11921160

1193-
buffer = torch.empty(bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type)
1161+
buffer = torch.empty(
1162+
bucket_size * 2, dtype=torch.uint8, device=self.device_manager.device_type
1163+
)
11941164
handle = reduce_tensor(buffer)
11951165

11961166
buckets_by_owner_rank: dict[int, list[H2DBucket]] = defaultdict(list)

checkpoint_engine/worker.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
import torch
66
import zmq
7-
from .ps import DeviceManager
7+
8+
from checkpoint_engine.device_utils import DeviceManager, npu_generate_uuid
89

910

1011
def _rebuild_ipc(handle: tuple[Callable, tuple], device_id: int | None = None) -> torch.Tensor:
@@ -105,9 +106,8 @@ def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
105106
if current_platform.device_type == "gpu":
106107
device_uuid = current_platform.get_device_uuid(self.device.index)
107108
elif current_platform.device_type == "npu":
108-
device_uuid = (
109-
f"NPU-{current_platform.get_device_name(self.device.index)!s}-{self.rank}"
110-
)
109+
serial_number = npu_generate_uuid()
110+
device_uuid = f"NPU-{serial_number}"
111111
update_weights_from_ipc(
112112
self._zmq_ctx,
113113
zmq_handles[device_uuid],

0 commit comments

Comments
 (0)