
Commit 751e9a7

main2main

Signed-off-by: wangli <[email protected]>

Parent: bb7b74c

File tree

17 files changed (+147, -40 lines)

.github/workflows/pr_test_full.yaml

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
+        vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

.github/workflows/pr_test_light.yaml

Lines changed: 3 additions & 3 deletions

@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9
+      vllm: b75f826fca4febb17a76c12a45d5e315111c7618
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
+        vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]

     steps:
       - name: Free up disk space
@@ -154,7 +154,7 @@ jobs:
     name: e2e-light
     strategy:
      matrix:
-        vllm_version: [ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0]
+        vllm_version: [b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

docs/source/community/versioning_policy.md

Lines changed: 1 addition & 1 deletion

@@ -45,7 +45,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
+| main | b75f826fca4febb17a76c12a45d5e315111c7618, v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

 ## Release cadence

tests/e2e/multicard/test_shared_expert_dp.py

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@

 MODELS = [
     "deepseek-ai/DeepSeek-V2-Lite",
+    "deepseek-ai/DeepSeek-V2-Lite",
 ]
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

tests/ut/compilation/test_acl_graph.py

Lines changed: 5 additions & 2 deletions

@@ -803,7 +803,9 @@ def test_update_mla_dcp_pcp_params(self, _mock_graph_task_end):
             (q_nope, q_pe, k_nope, k_pe, block_table, seq_lens, num_heads,
              scale, num_kv_heads, out, lse))

-        update_mla_attn_dcp_pcp_params(self.update_stream, forward_context, 4)
+        with patch("torch_npu._C._npu_setStream", return_value=None):
+            update_mla_attn_dcp_pcp_params(self.update_stream, forward_context,
+                                           4)

         _mock_graph_task_end.assert_called_once()

@@ -842,6 +844,7 @@ def test_update_attn_dcp_pcp_params(self, _mock_graph_task_end):
             block_table, 128, actual_seq_lengths_kv, actual_seq_lengths_q,
             out, lse, 2, 0, 0))

-        update_attn_dcp_pcp_params(self.update_stream, forward_context, 4)
+        with patch("torch_npu._C._npu_setStream", return_value=None):
+            update_attn_dcp_pcp_params(self.update_stream, forward_context, 4)

         _mock_graph_task_end.assert_called_once()
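Both tests now stub out torch_npu's native stream switch so the update helpers can run on CI hosts without an Ascend device. A minimal, self-contained sketch of the same stubbing pattern, with a hypothetical `switch_stream` call standing in for `torch_npu._C._npu_setStream`:

```python
from unittest.mock import patch


class _FakeBackend:
    # Hypothetical stand-in for a native call such as torch_npu._C._npu_setStream.
    @staticmethod
    def switch_stream(stream_id: int) -> None:
        raise RuntimeError("no NPU available")


def run_update(backend) -> str:
    backend.switch_stream(0)  # would fail on a host without an NPU
    return "updated"


# Replacing the native call with a no-op mock lets the surrounding
# update logic execute on CPU-only CI runners.
with patch.object(_FakeBackend, "switch_stream", return_value=None):
    assert run_update(_FakeBackend) == "updated"
```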

tests/ut/spec_decode/test_eagle_proposer.py

Lines changed: 4 additions & 0 deletions

@@ -95,6 +95,8 @@ def test_load_model_pp1(self, mock_pp_group, mock_get_model,
         mock_model = MagicMock()
         mock_model.model.embed_tokens = MagicMock()
         mock_model.lm_head = MagicMock()
+        mock_model.multimodal_cpu_fields = None
+        mock_model.merge_by_field_config = None
         mock_get_model.return_value = MagicMock()
         self.proposer.name = SpecDcodeType.EAGLE

@@ -117,6 +119,8 @@ def test_load_model_pp_gt1(self, mock_pp_group, mock_get_model,

         mock_model = MagicMock()
         original_embed = MagicMock()
+        mock_model.multimodal_cpu_fields = None
+        mock_model.merge_by_field_config = None
         mock_get_model.return_value = MagicMock(model=MagicMock(
             embed_tokens=original_embed))

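The new `multimodal_cpu_fields` and `merge_by_field_config` attributes are pinned to `None` because a bare `MagicMock` fabricates truthy attributes on access, which would steer the load path down a multimodal branch these tests do not exercise (my reading of the intent; the commit does not state it). A quick illustration:

```python
from unittest.mock import MagicMock

mock_model = MagicMock()

# Attribute access on a MagicMock creates a child mock, and mocks are truthy,
# so a guard like `if model.multimodal_cpu_fields:` would take the wrong branch.
assert bool(mock_model.multimodal_cpu_fields)

# Pinning the attributes makes the guard behave like a real text-only model.
mock_model.multimodal_cpu_fields = None
mock_model.merge_by_field_config = None
assert not mock_model.multimodal_cpu_fields
```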

vllm_ascend/eplb/utils.py

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@


 def get_expert_map(self, layer_id):
-    return self.model.layers[layer_id].mlp.experts.get_map()
+    return self.model.layers[layer_id].mlp.experts.expert_map


 def get_log2phy_map(self, layer_id):

vllm_ascend/ops/fused_moe/fused_moe.py

Lines changed: 16 additions & 10 deletions

@@ -153,7 +153,7 @@ def __init__(self, *args, **kwargs):
         AscendFusedMoE.moe_counter += 1
         self.moe_instance_id = AscendFusedMoE.moe_counter

-        self.expert_map = None
+        self._expert_map = None
         self.log2phy = None

         if self.quant_config is None:
@@ -184,7 +184,7 @@ def __init__(self, *args, **kwargs):
             dtype=vllm_config.model_config.dtype)

         # init moe.
-        self.local_num_experts, self.expert_map, _ = determine_expert_map(
+        self.local_num_experts, self._expert_map, _ = determine_expert_map(
             self.ep_size, self.ep_rank, self.global_num_experts)
         # TODO: Temporary flag to indicate if static EPLB is enabled. This is a
         # workaround to bypass a quantization check that fails with float weights.
@@ -200,7 +200,7 @@ def __init__(self, *args, **kwargs):
                 self.expert_load_balancer.get_global_redundant_expert_num())
             self.global_num_experts = num_experts + self.global_redundant_expert_num
             try:
-                self.local_num_experts, self.expert_map = (
+                self.local_num_experts, self._expert_map = (
                     self.expert_load_balancer.get_rank_placement_map(
                         self.moe_instance_id, self.ep_rank))
                 self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
@@ -216,16 +216,16 @@ def __init__(self, *args, **kwargs):
         if self.dynamic_eplb:
             self.log2phy = determine_default_log2phy_map(
                 self.global_num_experts, self.ep_size, self.ep_rank).npu()
-        if self.expert_map is not None and isinstance(self.expert_map,
-                                                      torch.Tensor):
+        if self._expert_map is not None and isinstance(self._expert_map,
+                                                       torch.Tensor):
             logger.info_once(
                 "[EP Rank %s/%s] Expert parallelism is enabled. Local/global"
                 " number of experts: %s/%s. Experts local to global index map:"
                 " %s.", self.ep_rank, self.ep_size, self.local_num_experts,
                 self.global_num_experts,
-                get_compressed_expert_map(self.expert_map))
+                get_compressed_expert_map(self._expert_map))
         local_num_experts = (torch.sum(
-            self.expert_map != -1) if self.expert_map is not None else
+            self._expert_map != -1) if self._expert_map is not None else
                              self.global_num_experts)
         if self.dynamic_eplb:
             self.moe_load = torch.zeros(local_num_experts,
@@ -276,10 +276,16 @@ def _get_quant_type(self) -> QuantType:
         return QuantType.NONE

     def update_expert_map(self, new_expert_map):
-        self.expert_map = new_expert_map
+        self._expert_map = new_expert_map

-    def get_map(self):
-        return self.expert_map
+    @property
+    def expert_map(self) -> torch.Tensor | None:
+        return self._expert_map
+
+    @expert_map.setter
+    def expert_map(self, new_expert_map):
+        # TODO(Potabk): Remove this once we drop vllm v0.12.0 (this keeps
+        # backward compatibility with vllm v0.12.0).
+        self._expert_map = new_expert_map

     def get_log2phy_map(self):
         return self.log2phy
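The renamed `_expert_map` field plus the property/setter pair replaces the old `get_map()` accessor: internal code now reads and writes the private field, while external assignments to `expert_map` (which vLLM v0.12.0 still performs) are routed through the setter. A minimal sketch of the pattern, independent of the surrounding class:

```python
import torch


class _MoESketch:
    """Illustrative only; mirrors the property pattern, not AscendFusedMoE itself."""

    def __init__(self) -> None:
        self._expert_map: torch.Tensor | None = None

    @property
    def expert_map(self) -> torch.Tensor | None:
        return self._expert_map

    @expert_map.setter
    def expert_map(self, new_expert_map) -> None:
        # External writes (e.g. from vLLM v0.12.0) land on the private field.
        self._expert_map = new_expert_map


layer = _MoESketch()
layer.expert_map = torch.tensor([0, -1, 1])   # setter keeps old call sites working
assert layer.expert_map is layer._expert_map  # reads go through the property
```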

vllm_ascend/patch/platform/__init__.py

Lines changed: 6 additions & 1 deletion

@@ -17,10 +17,15 @@
 import os

 import vllm_ascend.patch.platform.patch_distributed  # noqa
-import vllm_ascend.patch.platform.patch_ec_connector  # noqa
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
+from vllm_ascend.utils import vllm_version_is

 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv(
         "EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
+
+if vllm_version_is("0.12.0"):
+    import vllm_ascend.patch.platform.patch_ec_connector012  # noqa
+else:
+    import vllm_ascend.patch.platform.patch_ec_connector  # noqa
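Patch selection is now decided once, at import time, by comparing against the installed vLLM version. A plausible shape for this gate (the real `vllm_version_is` lives in `vllm_ascend.utils`; this sketch assumes it is a plain comparison against the installed release string):

```python
import importlib

import vllm


def vllm_version_is(target_version: str) -> bool:
    # Assumed behavior: match the installed vLLM release string exactly.
    return vllm.__version__ == target_version


# Import-time gate: load whichever connector patch matches the installed vLLM.
_suffix = ("patch_ec_connector012" if vllm_version_is("0.12.0")
           else "patch_ec_connector")
importlib.import_module(f"vllm_ascend.patch.platform.{_suffix}")
```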

vllm_ascend/patch/platform/patch_ec_connector.py

Lines changed: 6 additions & 7 deletions

@@ -1,16 +1,15 @@
-import vllm.distributed.ec_transfer.ec_connector.shared_storage_connector
+import vllm.distributed.ec_transfer.ec_connector.example_connector
 from safetensors.torch import load_file
-from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
-from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (
-    ECSharedStorageConnector, ECSharedStorageConnectorMetadata)
+from vllm.distributed.ec_transfer.ec_connector.example_connector import (
+    ECConnectorMetadata, ECExampleConnector)
 from vllm.logger import logger


-class AscendECSharedStorageConnector(ECSharedStorageConnector):
+class AscendECExampleConnector(ECExampleConnector):

     def start_load_caches(self, encoder_cache, **kwargs) -> None:
         metadata: ECConnectorMetadata = self._get_connector_metadata()
-        assert isinstance(metadata, ECSharedStorageConnectorMetadata)
+        assert isinstance(metadata, ECConnectorMetadata)
         assert encoder_cache is not None
         if metadata is None:
             logger.warning((
@@ -29,4 +28,4 @@ def start_load_caches(self, encoder_cache, **kwargs) -> None:
             mm_data.mm_hash)


-vllm.distributed.ec_transfer.ec_connector.shared_storage_connector.ECSharedStorageConnector = AscendECSharedStorageConnector
+vllm.distributed.ec_transfer.ec_connector.example_connector.ECExampleConnector = AscendECExampleConnector
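The patch itself follows the usual vllm-ascend monkey-patch recipe: subclass the upstream connector, override the method that needs Ascend-specific behavior, then rebind the upstream module attribute so later lookups resolve to the subclass. A self-contained sketch with hypothetical names:

```python
import types

# Hypothetical stand-in for the upstream vLLM module and connector class.
upstream = types.ModuleType("upstream_connector_module")


class ExampleConnector:
    def start_load_caches(self, encoder_cache: dict, **kwargs) -> None:
        raise NotImplementedError("upstream path not supported on this platform")


upstream.ExampleConnector = ExampleConnector


class AscendExampleConnector(ExampleConnector):
    def start_load_caches(self, encoder_cache: dict, **kwargs) -> None:
        # Platform-specific cache loading would go here.
        encoder_cache.update(kwargs)


# Rebind the upstream symbol: code that resolves the class through the module
# at call time now instantiates the Ascend variant instead.
upstream.ExampleConnector = AscendExampleConnector

cache: dict = {}
upstream.ExampleConnector().start_load_caches(cache, mm_hash="abc")
assert cache == {"mm_hash": "abc"}
```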
