Skip to content

Commit 3b6334f

Browse files
authored
Merge branch 'main' into fix/tcp_store_group
Signed-off-by: Anjie Hou <149605198+specture724@users.noreply.github.com>
2 parents de1cf1d + 279a908 commit 3b6334f

File tree

2 files changed

+107
-4
lines changed

2 files changed

+107
-4
lines changed

checkpoint_engine/ps.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -665,14 +665,23 @@ def __init__(self, device_manager: DeviceManager):
665665
self.rank = int(os.getenv("RANK"))
666666
gpu_count = device_manager.device_module.device_count()
667667
local_rank = self.rank % gpu_count
668-
self.device = _get_my_rdma_device(local_rank, gpu_count, _get_rdma_devices())
668+
device_type = device_manager.device_type
669+
if device_type == "npu" and os.getenv("PS_P2P_STORE_RDMA_DEVICES") is None:
670+
self.device = ""
671+
else:
672+
self.device = _get_my_rdma_device(local_rank, gpu_count, _get_rdma_devices())
669673
self.ip = get_ip()
670674

671675
# we will start at most 8 ps processes, so we use 8 retries to avoid port conflicts in extreme cases
672676
retry_count = 8
673677
for i in range(retry_count):
674678
self.engine = TransferEngine()
675-
ret = self.engine.initialize(self.ip, "P2PHANDSHAKE", "rdma", self.device)
679+
ret = self.engine.initialize(
680+
self.ip,
681+
"P2PHANDSHAKE",
682+
"ascend_direct" if device_type == "npu" else "rdma",
683+
self.device,
684+
)
676685
if ret == 0:
677686
break
678687
# sleep 0.5 ~ 2.0s, to avoid port conflicts when two processes retry at the same time
@@ -770,14 +779,15 @@ def __init__(
770779
self._memory_pool: dict[str, list[MemoryBuffer]] = {}
771780
# dict key is owner_rank, value is a bucket metas list in owner_rank
772781
self._current_global_parameter_metas: dict[int, MemoryBufferMetaList] = {}
782+
# NPU transfer engine initialization requires prior set_device.
783+
device_index = self._local_rank
784+
self.device_manager.device_module.set_device(device_index)
773785
try:
774786
self._p2p_store = P2PStore(self.device_manager)
775787
except ImportError as e:
776788
logger.warning(f"[rank{self._rank}] fail to initialize p2p store due to {e}")
777789
self._p2p_store = None
778790

779-
device_index = self._local_rank
780-
self.device_manager.device_module.set_device(device_index)
781791
self._device_uuid = _get_physical_gpu_id(self.device_manager, device_index)
782792
self._rdma_device = None if self._p2p_store is None else self._p2p_store.device
783793

@@ -875,6 +885,8 @@ def gather_metas(self, checkpoint_name: str):
875885

876886
dist.all_gather_object(metas_lst, metas)
877887

888+
self._current_global_parameter_metas = {}
889+
878890
num_parameters = 0
879891
all_hosts: list[str] = []
880892
global_device_uuids: list[str] = []

docs/npu_start.md

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Getting Started on Ascend
2+
3+
## Overview
4+
5+
Due to hardware differences in Ascend devices, the method for running the Checkpoint Engine on Ascend platforms requires specific adaptations.
6+
7+
## Environment
8+
9+
To support features like IPC Buffer and Transfer Engine, the following Ascend software versions are required:
10+
11+
| Software | version |
12+
|-------------|-------------|
13+
| Ascend HDK | \>=25.3.rc1 |
14+
| cann | \>=8.3.RC1 | <!-- codespell:ignore -->
15+
| python | 3.11 |
16+
| torch | 2.7.1 |
17+
| torch_npu | 2.7.1 |
18+
| vllm | 0.11.0 |
19+
| vllm_ascend | 0.11.0rc0 |
20+
21+
## Installation
22+
23+
Install from src:
24+
```shell
25+
pip install -e .
26+
```
27+
Using the flexible P2P implementation requires the Transfer Engine. However, on Ascend devices the Transfer Engine cannot be installed via pip; it must be compiled from source.
28+
29+
Reference document: [Ascend Direct Transport documentation](https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/ascend_direct_transport.md)
30+
31+
32+
## Deploy vLLM Service
33+
34+
Since HCCL uses the default port 16666, when executing single-device multi-process tasks you need to manually assign ports to the processes.
35+
Additionally, HIXL — the transport layer used by the Transfer Engine — also defaults to port 16666 during link establishment, and currently there is no interface to change this. Therefore, when deploying the vLLM service, you must manually specify the port for each device via the ranktable file.
36+
37+
**ranktable file example:**
38+
```
39+
{
40+
"version": "1.0",
41+
"server_count": "2",
42+
"server_list": [
43+
{
44+
"server_id": "server1",
45+
"device": [
46+
{
47+
"device_id": "0",
48+
"device_ip": "ip1",
49+
"device_port": "23333", // Choose an available port other than 16666
50+
"rank_id": "0"
51+
},
52+
{
53+
"device_id": "1",
54+
"device_ip": "ip2",
55+
"device_port": "23333",
56+
"rank_id": "1"
57+
}...
58+
]
59+
},
60+
{
61+
"server_id": "server2",
62+
"device": [
63+
{
64+
"device_id": "0",
65+
"device_ip": "ip8",
66+
"device_port": "23333",
67+
"rank_id": "8"
68+
}...
69+
]
70+
}...
71+
]
72+
}
73+
```
74+
75+
Set the `RANK_TABLE_FILE` environment variable when starting vLLM.
76+
```shell
77+
RANK_TABLE_FILE=ranktable.json VLLM_SERVER_DEV_MODE=1 python3 -m vllm.entrypoints.openai.api_server \
78+
--host 0.0.0.0 --port 19730 --trust-remote-code --tensor-parallel-size=8 --max-model-len 4096 \
79+
--load-format dummy --served-model-name checkpoint-engine-demo \
80+
--model /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
81+
--worker-extension-cls checkpoint_engine.worker.VllmColocateWorkerExtension
82+
```
83+
84+
The command to start the Checkpoint Engine remains the same.
85+
```shell
86+
torchrun --nproc-per-node 8 --log_dir=$(pwd)/logs --redirect 3 examples/update.py --update-method all --checkpoint-path /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/
87+
```
88+
89+
## Important Notes
90+
91+
1. Set the `ASCEND_RT_VISIBLE_DEVICES` environment variable according to the actual NPUs in use. Failure to do so will cause host quantity validation to fail in P2P mode.

0 commit comments

Comments
 (0)