|
24 | 24 | from torch.multiprocessing.reductions import reduce_tensor |
25 | 25 |
|
26 | 26 |
|
27 | | -def is_torch_npu_available() -> bool: |
28 | | - try: |
29 | | - if hasattr(torch, "npu") and callable(getattr(torch.npu, "is_available", None)): |
30 | | - return torch.npu.is_available() |
31 | | - else: |
32 | | - return False |
33 | | - except ImportError: |
34 | | - return False |
35 | | - |
36 | 27 | class DeviceManager: |
37 | 28 | def __init__(self): |
38 | 29 | self.device_type = self._detect_device_type() |
39 | 30 | self._setup_device_module() |
40 | 31 |
|
| 32 | + def _is_torch_npu_available(self) -> bool: |
| 33 | + try: |
| 34 | + if hasattr(torch, "npu") and callable(getattr(torch.npu, "is_available", None)): |
| 35 | + return torch.npu.is_available() |
| 36 | + else: |
| 37 | + return False |
| 38 | + except ImportError: |
| 39 | + return False |
| 40 | + |
41 | 41 | def _detect_device_type(self) -> str: |
42 | | - if is_torch_npu_available(): |
| 42 | + if self._is_torch_npu_available(): |
43 | 43 | return "npu" |
44 | 44 | elif torch.cuda.is_available(): |
45 | 45 | return "cuda" |
| 46 | + else: |
| 47 | + raise TypeError("The current device type is not supported") |
46 | 48 |
|
47 | 49 | def _setup_device_module(self): |
48 | 50 | if self.device_type == "npu": |
49 | 51 | import torch_npu |
50 | 52 | self.device_module = torch_npu.npu |
51 | 53 | elif self.device_type == "cuda": |
52 | 54 | self.device_module = torch.cuda |
| 55 | + else: |
| 56 | + raise TypeError("The current device type is not supported") |
53 | 57 |
|
54 | | - def get_backend(self) -> str: |
| 58 | + @property |
| 59 | + def backend(self) -> str: |
55 | 60 | if self.device_type == "npu": |
56 | 61 | return "hccl" |
57 | 62 | elif self.device_type == "cuda": |
@@ -283,10 +288,10 @@ def _concat_tp_weights( |
283 | 288 | return torch.cat([w for w in tp_weights], dim=tp_concat_dim) |
284 | 289 |
|
285 | 290 |
|
286 | | -def _get_physical_gpu_id(device_manager: DeviceManager, device_index: int | None = None) -> str: |
| 291 | +def _get_physical_gpu_id(device_manager: DeviceManager, rank_id: int, device_index: int | None = None) -> str: |
287 | 292 | try: |
288 | 293 | if device_manager.device_type == "npu": |
289 | | - return f"NPU-{device_manager.device_module.get_device_properties(device_index).name!s}-{device_index}" |
| 294 | + return f"NPU-{device_manager.device_module.get_device_properties(device_index).name!s}-{rank_id}" |
290 | 295 | else: |
291 | 296 | return f"GPU-{device_manager.device_module.get_device_properties(device_index).uuid!s}" |
292 | 297 | except AssertionError as e: |
@@ -625,7 +630,7 @@ def _get_master_port(master_port: int | None = None) -> int: |
625 | 630 |
|
626 | 631 |
|
627 | 632 | class P2PStore: |
628 | | - def __init__(self, device_manager: DeviceManager): |
| 633 | + def __init__(self, device_manager : DeviceManager): |
629 | 634 | from mooncake.engine import TransferEngine |
630 | 635 |
|
631 | 636 | self.rank = int(os.getenv("RANK")) |
@@ -742,7 +747,7 @@ def __init__( |
742 | 747 |
|
743 | 748 | device_index = self._local_rank |
744 | 749 | self.device_manager.device_module.set_device(device_index) |
745 | | - self._device_uuid = _get_physical_gpu_id(self.device_manager, device_index) |
| 750 | + self._device_uuid = _get_physical_gpu_id(self.device_manager, self._rank, device_index) |
746 | 751 |
|
747 | 752 | def _logger_rank0(self, msg: str): |
748 | 753 | if self._local_rank == 0: |
@@ -880,7 +885,7 @@ def init_process_group( |
880 | 885 | is_master=self._rank == 0, |
881 | 886 | ) |
882 | 887 | dist.init_process_group( |
883 | | - backend=self.device_manager.get_backend(), |
| 888 | + backend=self.device_manager.backend, |
884 | 889 | world_size=self._world_size, |
885 | 890 | rank=self._rank, |
886 | 891 | timeout=timeout, |
|
0 commit comments