
Commit 751e9a7

main2main

Signed-off-by: wangli <[email protected]>

Parent: bb7b74c

File tree

17 files changed (+147, -40 lines)

.github/workflows/pr_test_full.yaml

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
+        vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

.github/workflows/pr_test_light.yaml

Lines changed: 3 additions & 3 deletions

@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9
+      vllm: b75f826fca4febb17a76c12a45d5e315111c7618
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
+        vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]

     steps:
       - name: Free up disk space
@@ -154,7 +154,7 @@ jobs:
     name: e2e-light
     strategy:
      matrix:
-        vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
+        vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

docs/source/community/versioning_policy.md

Lines changed: 1 addition & 1 deletion

@@ -45,7 +45,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence

tests/e2e/multicard/test_shared_expert_dp.py

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@

 MODELS = [
     "deepseek-ai/DeepSeek-V2-Lite",
+    "deepseek-ai/DeepSeek-V2-Lite",
 ]
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

tests/ut/compilation/test_acl_graph.py

Lines changed: 5 additions & 2 deletions

@@ -803,7 +803,9 @@ def test_update_mla_dcp_pcp_params(self, _mock_graph_task_end):
             (q_nope, q_pe, k_nope, k_pe, block_table, seq_lens, num_heads,
              scale, num_kv_heads, out, lse))

-        update_mla_attn_dcp_pcp_params(self.update_stream, forward_context, 4)
+        with patch("torch_npu._C._npu_setStream", return_value=None):
+            update_mla_attn_dcp_pcp_params(self.update_stream, forward_context,
+                                           4)

         _mock_graph_task_end.assert_called_once()

@@ -842,6 +844,7 @@ def test_update_attn_dcp_pcp_params(self, _mock_graph_task_end):
             block_table, 128, actual_seq_lengths_kv, actual_seq_lengths_q,
             out, lse, 2, 0, 0))

-        update_attn_dcp_pcp_params(self.update_stream, forward_context, 4)
+        with patch("torch_npu._C._npu_setStream", return_value=None):
+            update_attn_dcp_pcp_params(self.update_stream, forward_context, 4)

         _mock_graph_task_end.assert_called_once()
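Both tests now stub out torch_npu's native stream switch so the update helpers can run on CI hosts without an Ascend device. A minimal, self-contained sketch of the same stubbing pattern, with a hypothetical `switch_stream` call standing in for `torch_npu._C._npu_setStream`:

```python
from unittest.mock import patch


class _FakeBackend:
    # Hypothetical stand-in for a native call such as torch_npu._C._npu_setStream.
    @staticmethod
    def switch_stream(stream_id: int) -> None:
        raise RuntimeError("no NPU available")


def run_update(backend) -> str:
    backend.switch_stream(0)  # would fail on a host without an NPU
    return "updated"


# Replacing the native call with a no-op mock lets the surrounding
# update logic execute on CPU-only CI runners.
with patch.object(_FakeBackend, "switch_stream", return_value=None):
    assert run_update(_FakeBackend) == "updated"
```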

tests/ut/spec_decode/test_eagle_proposer.py

Lines changed: 4 additions & 0 deletions

@@ -95,6 +95,8 @@ def test_load_model_pp1(self, mock_pp_group, mock_get_model,
         mock_model = MagicMock()
         mock_model.model.embed_tokens = MagicMock()
         mock_model.lm_head = MagicMock()
+        mock_model.multimodal_cpu_fields = None
+        mock_model.merge_by_field_config = None
         mock_get_model.return_value = MagicMock()
         self.proposer.name = SpecDcodeType.EAGLE

@@ -117,6 +119,8 @@ def test_load_model_pp_gt1(self, mock_pp_group, mock_get_model,

         mock_model = MagicMock()
         original_embed = MagicMock()
+        mock_model.multimodal_cpu_fields = None
+        mock_model.merge_by_field_config = None
         mock_get_model.return_value = MagicMock(model=MagicMock(
             embed_tokens=original_embed))

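The new `multimodal_cpu_fields` and `merge_by_field_config` attributes are pinned to `None` because a bare `MagicMock` fabricates truthy attributes on access, which would steer the load path down a multimodal branch these tests do not exercise (my reading of the intent; the commit does not state it). A quick illustration:

```python
from unittest.mock import MagicMock

mock_model = MagicMock()

# Attribute access on a MagicMock creates a child mock, and mocks are truthy,
# so a guard like `if model.multimodal_cpu_fields:` would take the wrong branch.
assert bool(mock_model.multimodal_cpu_fields)

# Pinning the attributes makes the guard behave like a real text-only model.
mock_model.multimodal_cpu_fields = None
mock_model.merge_by_field_config = None
assert not mock_model.multimodal_cpu_fields
```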

vllm_ascend/eplb/utils.py

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@


 def get_expert_map(self, layer_id):
-    return self.model.layers[layer_id].mlp.experts.get_map()
+    return self.model.layers[layer_id].mlp.experts.expert_map


 def get_log2phy_map(self, layer_id):

vllm_ascend/ops/fused_moe/fused_moe.py

Lines changed: 16 additions & 10 deletions

@@ -153,7 +153,7 @@ def __init__(self, *args, **kwargs):
         AscendFusedMoE.moe_counter += 1
         self.moe_instance_id = AscendFusedMoE.moe_counter

-        self.expert_map = None
+        self._expert_map = None
         self.log2phy = None

         if self.quant_config is None:
@@ -184,7 +184,7 @@ def __init__(self, *args, **kwargs):
             dtype=vllm_config.model_config.dtype)

         # init moe.
-        self.local_num_experts, self.expert_map, _ = determine_expert_map(
+        self.local_num_experts, self._expert_map, _ = determine_expert_map(
             self.ep_size, self.ep_rank, self.global_num_experts)
         # TODO: Temporary flag to indicate if static EPLB is enabled. This is a
         # workaround to bypass a quantization check that fails with float weights.
@@ -200,7 +200,7 @@ def __init__(self, *args, **kwargs):
                 self.expert_load_balancer.get_global_redundant_expert_num())
             self.global_num_experts = num_experts + self.global_redundant_expert_num
             try:
-                self.local_num_experts, self.expert_map = (
+                self.local_num_experts, self._expert_map = (
                     self.expert_load_balancer.get_rank_placement_map(
                         self.moe_instance_id, self.ep_rank))
                 self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
@@ -216,16 +216,16 @@ def __init__(self, *args, **kwargs):
         if self.dynamic_eplb:
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank).npu()
-        if self.expert_map is not None and isinstance(self.expert_map,
-                                                      torch.Tensor):
+        if self._expert_map is not None and isinstance(self._expert_map,
+                                                       torch.Tensor):
             logger.info_once(
                 "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
                 " number of experts: %s/%s. Experts local to global index map:"
                 " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
                 self.global_num_experts,
-                get_compressed_expert_map(self.expert_map))
+                get_compressed_expert_map(self._expert_map))
         local_num_experts = (torch.sum(
-            self.expert_map != -1) if self.expert_map is not None else
+            self._expert_map != -1) if self._expert_map is not None else
                              self.global_num_experts)
         if self.dynamic_eplb:
             self.moe_load = torch.zeros(local_num_experts,
@@ -276,10 +276,16 @@ def _get_quant_type(self) -> QuantType:
         return QuantType.NONE

     def update_expert_map(self, new_expert_map):
-        self.expert_map = new_expert_map
+        self._expert_map = new_expert_map

-    def get_map(self):
-        return self.expert_map
+    @property
+    def expert_map(self) -> torch.Tensor | None:
+        return self._expert_map
+
+    @expert_map.setter
+    def expert_map(self, new_expert_map):
+        # TODO(Potabk): Remove this once we drop vllm v0.12.0 (this keeps
+        # backward compatibility with vllm v0.12.0).
+        self._expert_map = new_expert_map

     def get_log2phy_map(self):
         return self.log2phy
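The renamed `_expert_map` field plus the property/setter pair replaces the old `get_map()` accessor: internal code now reads and writes the private field, while external assignments to `expert_map` (which vLLM v0.12.0 still performs) are routed through the setter. A minimal sketch of the pattern, independent of the surrounding class:

```python
import torch


class _MoESketch:
    """Illustrative only; mirrors the property pattern, not AscendFusedMoE itself."""

    def __init__(self) -> None:
        self._expert_map: torch.Tensor | None = None

    @property
    def expert_map(self) -> torch.Tensor | None:
        return self._expert_map

    @expert_map.setter
    def expert_map(self, new_expert_map) -> None:
        # External writes (e.g. from vLLM v0.12.0) land on the private field.
        self._expert_map = new_expert_map


layer = _MoESketch()
layer.expert_map = torch.tensor([0, -1, 1])   # setter keeps old call sites working
assert layer.expert_map is layer._expert_map  # reads go through the property
```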

vllm_ascend/patch/platform/__init__.py

Lines changed: 6 additions & 1 deletion

@@ -17,10 +17,15 @@
 import os

 import vllm_ascend.patch.platform.patch_distributed  # noqa
-import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
+from vllm_ascend.utils import vllm_version_is

 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
         "EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
+
+if vllm_version_is("0.12.0"):
+    import vllm_ascend.patch.platform.patch_ec_connector012  # noqa
+else:
+    import vllm_ascend.patch.platform.patch_ec_connector  # noqa
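Patch selection is now decided once, at import time, by comparing against the installed vLLM version. A plausible shape for this gate (the real `vllm_version_is` lives in `vllm_ascend.utils`; this sketch assumes it is a plain comparison against the installed release string):

```python
import importlib

import vllm


def vllm_version_is(target_version: str) -> bool:
    # Assumed behavior: match the installed vLLM release string exactly.
    return vllm.__version__ == target_version


# Import-time gate: load whichever connector patch matches the installed vLLM.
_suffix = ("patch_ec_connector012" if vllm_version_is("0.12.0")
           else "patch_ec_connector")
importlib.import_module(f"vllm_ascend.patch.platform.{_suffix}")
```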

vllm_ascend/patch/platform/patch_ec_connector.py

Lines changed: 6 additions & 7 deletions

@@ -1,16 +1,15 @@
-import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector
+import vllm.distributed.ec_transfer.ec_connector.example_connector
 from safetensors.torch import load_file
-from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
-from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (
-    ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
+from vllm.distributed.ec_transfer.ec_connector.example_connector import (
+    ECConnectorMetadata, ECExampleConnector)
 from vllm.logger import logger


-class AscendECSharedStorageConnector(ECSharedStorageConnector):
+class AscendECExampleConnector(ECExampleConnector):

     def start_load_caches(self, encoder_cache, **kwargs) -> None:
         metadata: ECConnectorMetadata = self._get_connector_metadata()
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert isinstance(metadata, ECConnectorMetadata)
         assert encoder_cache is not None
         if metadata is None:
             logger.warning((
@@ -29,4 +28,4 @@ def start_load_caches(self, encoder_cache, **kwargs) -> None:
             mm_data.mm_hash)


-vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector
+vllm.distributed.ec_transfer.ec_connector.example_connector.ECExampleConnector = AscendECExampleConnector
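The patch itself follows the usual vllm-ascend monkey-patch recipe: subclass the upstream connector, override the method that needs Ascend-specific behavior, then rebind the upstream module attribute so later lookups resolve to the subclass. A self-contained sketch with hypothetical names:

```python
import types

# Hypothetical stand-in for the upstream vLLM module and connector class.
upstream = types.ModuleType("upstream_connector_module")


class ExampleConnector:
    def start_load_caches(self, encoder_cache: dict, **kwargs) -> None:
        raise NotImplementedError("upstream path not supported on this platform")


upstream.ExampleConnector = ExampleConnector


class AscendExampleConnector(ExampleConnector):
    def start_load_caches(self, encoder_cache: dict, **kwargs) -> None:
        # Platform-specific cache loading would go here.
        encoder_cache.update(kwargs)


# Rebind the upstream symbol: code that resolves the class through the module
# at call time now instantiates the Ascend variant instead.
upstream.ExampleConnector = AscendExampleConnector

cache: dict = {}
upstream.ExampleConnector().start_load_caches(cache, mm_hash="abc")
assert cache == {"mm_hash": "abc"}
```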
