Skip to content

Commit 779eb4c

Browse files
committed
feat: use torch.cuda.get_device_properties() to get device_uuid instead of nvidia-smi -L
1 parent 03ff7e7 commit 779eb4c

File tree

1 file changed

+5
-13
lines changed

1 file changed

+5
-13
lines changed

checkpoint_engine/ps.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import pickle
77
import random
88
import socket
9-
import subprocess
109
import threading
1110
import time
1211
from collections import defaultdict
@@ -242,16 +241,8 @@ def _concat_tp_weights(
242241
return torch.cat([w for w in tp_weights], dim=tp_concat_dim)
243242

244243

245-
def _get_physical_gpu_id(rank: int) -> str:
246-
result = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True) # noqa: S607
247-
if result.returncode != 0:
248-
raise ValueError(result.stdout)
249-
lines = result.stdout.strip().split("\n")
250-
for line in lines:
251-
if f"GPU {rank}" in line:
252-
uuid = line.split("UUID: ")[1].strip(")")
253-
return uuid
254-
raise ValueError(f"not found gpu{rank} uuid")
244+
def _get_physical_gpu_id(device_index: int | None = None) -> str:
    """Return the physical GPU identifier for *device_index* in nvidia-smi style.

    Uses ``torch.cuda.get_device_properties`` (no subprocess needed) and
    prefixes the reported UUID with ``GPU-`` so the result matches the
    ``nvidia-smi -L`` format, e.g. ``GPU-xxxxxxxx-xxxx-...``.

    Args:
        device_index: CUDA device index; ``None`` means the current device.

    Returns:
        The ``GPU-<uuid>`` string for the selected device.
    """
    props = torch.cuda.get_device_properties(device_index)
    return "GPU-" + str(props.uuid)
255246

256247

257248
@lru_cache(maxsize=1)
@@ -610,7 +601,6 @@ def __init__(self, *, auto_pg: bool = False):
610601
assert self._rank is not None and self._rank >= 0, self._rank
611602
assert self._world_size and self._world_size > 0, self._world_size
612603

613-
self._device_uuid = _get_physical_gpu_id(self._local_rank)
614604
self._zmq_ctx = zmq.Context()
615605
self._zmq_addr_counter = 0
616606

@@ -623,7 +613,9 @@ def __init__(self, *, auto_pg: bool = False):
623613
logger.warning(f"[rank{self._rank}] fail to initialize p2p store due to {e}")
624614
self._p2p_store = None
625615

626-
torch.cuda.set_device(self._local_rank)
616+
device_index = self._local_rank
617+
torch.cuda.set_device(device_index)
618+
self._device_uuid = _get_physical_gpu_id(device_index)
627619

628620
def _logger_rank0(self, msg: str):
629621
if self._local_rank == 0:

0 commit comments

Comments
 (0)