Skip to content

Commit 3b6334f

Browse files
authored
Merge branch 'main' into fix/tcp_store_group
Signed-off-by: Anjie Hou <149605198+specture724@users.noreply.github.com>
2 parents de1cf1d + 279a908 commit 3b6334f

File tree

2 files changed

+107
-4
lines changed

2 files changed

+107
-4
lines changed

checkpoint_engine/ps.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -665,14 +665,23 @@ def __init__(self, device_manager: DeviceManager):
665665
self.rank = int(os.getenv("RANK"))
666666
gpu_count = device_manager.device_module.device_count()
667667
local_rank = self.rank % gpu_count
668-
self.device = _get_my_rdma_device(local_rank, gpu_count, _get_rdma_devices())
668+
device_type = device_manager.device_type
669+
if device_type == "npu" and os.getenv("PS_P2P_STORE_RDMA_DEVICES") is None:
670+
self.device = ""
671+
else:
672+
self.device = _get_my_rdma_device(local_rank, gpu_count, _get_rdma_devices())
669673
self.ip = get_ip()
670674

671675
# we will start at most 8 ps processes, so we use 8 retries to avoid port conflicts in extreme cases
672676
retry_count = 8
673677
for i in range(retry_count):
674678
self.engine = TransferEngine()
675-
ret = self.engine.initialize(self.ip, "P2PHANDSHAKE", "rdma", self.device)
679+
ret = self.engine.initialize(
680+
self.ip,
681+
"P2PHANDSHAKE",
682+
"ascend_direct" if device_type == "npu" else "rdma",
683+
self.device,
684+
)
676685
if ret == 0:
677686
break
678687
# sleep 0.5 ~ 2.0s, to avoid port conflicts when two processes retry at the same time
@@ -770,14 +779,15 @@ def __init__(
770779
self._memory_pool: dict[str, list[MemoryBuffer]] = {}
771780
# dict key is owner_rank, value is a bucket metas list in owner_rank
772781
self._current_global_parameter_metas: dict[int, MemoryBufferMetaList] = {}
782+
# NPU transfer engine initialization requires prior set_device.
783+
device_index = self._local_rank
784+
self.device_manager.device_module.set_device(device_index)
773785
try:
774786
self._p2p_store = P2PStore(self.device_manager)
775787
except ImportError as e:
776788
logger.warning(f"[rank{self._rank}] fail to initialize p2p store due to {e}")
777789
self._p2p_store = None
778790

779-
device_index = self._local_rank
780-
self.device_manager.device_module.set_device(device_index)
781791
self._device_uuid = _get_physical_gpu_id(self.device_manager, device_index)
782792
self._rdma_device = None if self._p2p_store is None else self._p2p_store.device
783793

@@ -875,6 +885,8 @@ def gather_metas(self, checkpoint_name: str):
875885

876886
dist.all_gather_object(metas_lst, metas)
877887

888+
self._current_global_parameter_metas = {}
889+
878890
num_parameters = 0
879891
all_hosts: list[str] = []
880892
global_device_uuids: list[str] = []

docs/npu_start.md

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Getting Started on Ascend
2+
3+
## Overview
4+
5+
Due to hardware differences in Ascend devices, the method for running the Checkpoint Engine on Ascend platforms requires specific adaptations.
6+
7+
## Environment
8+
9+
To support features like IPC Buffer and Transfer Engine, the following Ascend software versions are required:
10+
11+
| Software | version |
12+
|-------------|-------------|
13+
| Ascend HDK | \>=25.3.rc1 |
14+
| cann | \>=8.3.RC1 | <!-- codespell:ignore -->
15+
| python | 3.11 |
16+
| torch | 2.7.1 |
17+
| torch_npu | 2.7.1 |
18+
| vllm | 0.11.0 |
19+
| vllm_ascend | 0.11.0rc0 |
20+
21+
## Installation
22+
23+
Install from src:
24+
```shell
25+
pip install -e .
26+
```
27+
Using the flexible P2P implementation requires the Transfer Engine. However, on Ascend devices the Transfer Engine cannot be installed via pip; it must be compiled from source.
28+
29+
Reference document: [Ascend Direct Transport documentation](https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/ascend_direct_transport.md)
30+
31+
32+
## Deploy vLLM Service
33+
34+
Since HCCL uses the default port 16666, when executing single-device multi-process tasks you need to manually assign ports to the processes.
35+
Additionally, HIXL — the transport layer used by the Transfer Engine — also defaults to port 16666 during link establishment, and currently there is no interface to change this. Therefore, when deploying the vLLM service, you must manually specify the port for each device via the ranktable file.
36+
37+
**ranktable file example:**
38+
```
39+
{
40+
"version": "1.0",
41+
"server_count": "2",
42+
"server_list": [
43+
{
44+
"server_id": "server1",
45+
"device": [
46+
{
47+
"device_id": "0",
48+
"device_ip": "ip1",
49+
"device_port": "23333", // Choose an available port other than 16666
50+
"rank_id": "0"
51+
},
52+
{
53+
"device_id": "1",
54+
"device_ip": "ip2",
55+
"device_port": "23333",
56+
"rank_id": "1"
57+
}...
58+
]
59+
},
60+
{
61+
"server_id": "server2",
62+
"device": [
63+
{
64+
"device_id": "0",
65+
"device_ip": "ip8",
66+
"device_port": "23333",
67+
"rank_id": "8"
68+
}...
69+
]
70+
}...
71+
]
72+
}
73+
```
74+
75+
Set the `RANK_TABLE_FILE` environment variable when starting vLLM.
76+
```shell
77+
RANK_TABLE_FILE=ranktable.json VLLM_SERVER_DEV_MODE=1 python3 -m vllm.entrypoints.openai.api_server \
78+
--host 0.0.0.0 --port 19730 --trust-remote-code --tensor-parallel-size=8 --max-model-len 4096 \
79+
--load-format dummy --served-model-name checkpoint-engine-demo \
80+
--model /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/ \
81+
--worker-extension-cls checkpoint_engine.worker.VllmColocateWorkerExtension
82+
```
83+
84+
The command to start the Checkpoint Engine remains the same.
85+
```shell
86+
torchrun --nproc-per-node 8 --log_dir=$(pwd)/logs --redirect 3 examples/update.py --update-method all --checkpoint-path /opt/models/Qwen/Qwen3-235B-A22B-Instruct-2507/
87+
```
88+
89+
## Important Notes
90+
91+
1. Set the `ASCEND_RT_VISIBLE_DEVICES` environment variable according to the actual NPUs in use. Failure to do so will cause host quantity validation to fail in P2P mode.

0 commit comments

Comments
 (0)