Skip to content

Commit 4d3ddb2

Browse files
itayalroy authored and khairulkabir1661 committed
elastic_ep: Fix stateless group port races (vllm-project#36330)
Signed-off-by: Itay Alroy <ialroy@nvidia.com>
1 parent 2152b36 commit 4d3ddb2

12 files changed

Lines changed: 224 additions & 225 deletions

File tree

.buildkite/test_areas/expert_parallelism.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,7 @@ steps:
2424

2525
- label: Elastic EP Scaling Test
2626
timeout_in_minutes: 20
27-
device: b200
28-
optional: true
27+
device: h100
2928
working_dir: "/vllm-workspace/tests"
3029
num_devices: 4
3130
source_file_dependencies:

vllm/config/parallel.py

Lines changed: 31 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

44
import os
5+
import socket
56
from collections.abc import Callable
67
from typing import TYPE_CHECKING, Any, Literal, overload
78

@@ -266,33 +267,9 @@ class is dynamically inherited by the worker class. This is used to inject
266267
Set to be private as it's not intended to be configured by users.
267268
"""
268269

269-
_stateless_dp_group_port_list: list[list[int]] = Field(default_factory=list)
270-
"""List of open ports for stateless DP groups when enable_elastic_ep is True.
271-
Set to be private as it's not intended to be configured by users.
272-
It is a list of list[int], with each inner list contains a set of 3 ports
273-
to be used for setting up the stateless CPU/device/TCPStore groups
274-
in StatelessGroupCoordinator. The number of inner lists is equal to
275-
the number of DP groups,
276-
i.e., len(self._stateless_dp_group_port_list) == world_size_across_dp // dp_size,
277-
and len(self._stateless_dp_group_port_list[i]) == 3 for all i.
278-
"""
279-
280-
_stateless_ep_group_port_list: list[list[int]] = Field(default_factory=list)
281-
"""List of open ports for stateless EP groups when enable_elastic_ep is True.
282-
Set to be private as it's not intended to be configured by users.
283-
len(self._stateless_ep_group_port_list) == world_size_across_dp // ep_size,
284-
"""
285-
286-
_stateless_eplb_group_port_list: list[list[int]] = Field(default_factory=list)
287-
"""List of open ports for stateless EPLB groups when enable_elastic_ep is True.
288-
Same topology as EP but separate NCCL communicator to avoid deadlocks.
289-
"""
290-
291-
_stateless_world_group_port_list: list[list[int]] = Field(default_factory=list)
292-
"""List of open ports for stateless world group when enable_elastic_ep is True.
293-
Set to be private as it's not intended to be configured by users.
294-
len(self._stateless_world_group_port_list) == 1,
295-
"""
270+
_coord_store_port: int = 0
271+
"""Port of the coordination TCPStore. Can be set by the API server; workers
272+
connect as clients to exchange self-picked group ports at runtime."""
296273

297274
decode_context_parallel_size: int = 1
298275
"""Number of decode context parallel groups, because the world size does
@@ -465,65 +442,32 @@ def get_next_dp_init_port(self) -> int:
465442

466443
return answer
467444

468-
def allocate_elastic_ep_ports(self) -> None:
469-
"""Allocate all ports for elastic EP (stateless groups + DP master).
445+
def _pick_stateless_dp_port(self) -> tuple[int, socket.socket | None]:
446+
"""Return ``(port, listen_socket)`` for DP group init.
470447
471-
Must be called AFTER ray.init() so that ports claimed by Ray's
472-
idle worker pool are already in use and won't be returned by
473-
get_open_ports_list().
448+
With a coord store, rank 0 binds a socket and publishes the port;
449+
others read it. Without one, pops a pre-allocated port and
450+
returns ``listen_socket=None``.
474451
"""
475-
if not self.enable_elastic_ep:
476-
return
477-
if self._stateless_world_group_port_list:
478-
return
479-
480-
num_world_groups = 1
481-
dp_size = self.data_parallel_size
482-
ep_size = self.data_parallel_size * self.world_size_across_dp
483-
num_dp_groups = max(1, self.world_size_across_dp // dp_size)
484-
num_ep_groups = max(1, self.world_size_across_dp // ep_size)
485-
num_eplb_groups = num_ep_groups
486-
total_stateless_ports = (
487-
num_world_groups + num_dp_groups + num_ep_groups + num_eplb_groups
488-
) * 3
489-
num_dp_master_ports = 5
490-
491-
all_ports = get_open_ports_list(total_stateless_ports + num_dp_master_ports)
492-
493-
self._data_parallel_master_port_list = all_ports[-num_dp_master_ports:]
494-
self.data_parallel_master_port = self._data_parallel_master_port_list.pop()
495-
all_ports = all_ports[:-num_dp_master_ports]
496-
497-
self._stateless_world_group_port_list = [
498-
all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
499-
]
500-
start_idx = num_world_groups * 3
501-
self._stateless_dp_group_port_list = [
502-
all_ports[i : i + 3]
503-
for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
504-
]
505-
start_idx += num_dp_groups * 3
506-
self._stateless_ep_group_port_list = [
507-
all_ports[i : i + 3]
508-
for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
509-
]
510-
start_idx += num_ep_groups * 3
511-
self._stateless_eplb_group_port_list = [
512-
all_ports[i : i + 3]
513-
for i in range(start_idx, start_idx + num_eplb_groups * 3, 3)
514-
]
515-
516-
def get_next_stateless_world_group_port(self) -> list[int]:
517-
return self._stateless_world_group_port_list.pop()
518-
519-
def get_next_stateless_dp_group_port(self) -> list[int]:
520-
return self._stateless_dp_group_port_list.pop()
521-
522-
def get_next_stateless_ep_group_port(self) -> list[int]:
523-
return self._stateless_ep_group_port_list.pop()
524-
525-
def get_next_stateless_eplb_group_port(self) -> list[int]:
526-
return self._stateless_eplb_group_port_list.pop()
452+
if not self._coord_store_port:
453+
return self.get_next_dp_init_port(), None
454+
455+
from vllm.distributed.utils import get_cached_tcp_store_client
456+
457+
store = get_cached_tcp_store_client(
458+
self.data_parallel_master_ip, self._coord_store_port
459+
)
460+
461+
key = "dp_master_port"
462+
if self.data_parallel_rank == 0:
463+
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
464+
s.bind((self.data_parallel_master_ip, 0))
465+
s.listen()
466+
port = s.getsockname()[1]
467+
store.set(key, str(port).encode())
468+
return port, s
469+
else:
470+
return int(store.get(key).decode()), None
527471

528472
@overload
529473
def stateless_init_dp_group(
@@ -553,14 +497,16 @@ def stateless_init_dp_group(
553497
last_exc: Exception | None = None
554498
for _ in range(max_retries):
555499
try:
500+
port, listen_socket = self._pick_stateless_dp_port()
556501
# use gloo since the engine process might not have cuda device
557502
return stateless_init_torch_distributed_process_group(
558503
self.data_parallel_master_ip,
559-
self.get_next_dp_init_port(),
504+
port,
560505
self.data_parallel_rank,
561506
self.data_parallel_size,
562507
backend="gloo",
563508
return_store=return_store,
509+
listen_socket=listen_socket,
564510
)
565511
except DistNetworkError as e:
566512
# We only want to retry when the root cause is EADDRINUSE.

vllm/distributed/elastic_ep/elastic_execute.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,8 @@ def create_standby_groups(
162162
new_dp_size=new_dp_size,
163163
new_world_size_across_dp=new_world_size_across_dp,
164164
master_ip=reconfig_request.new_data_parallel_master_ip,
165-
world_group_ports=reconfig_request.new_stateless_world_group_port_list,
166-
dp_group_ports=reconfig_request.new_stateless_dp_group_port_list,
167-
ep_group_ports=reconfig_request.new_stateless_ep_group_port_list,
168-
eplb_group_ports=reconfig_request.new_stateless_eplb_group_port_list,
165+
coord_store_port=reconfig_request.coord_store_port,
166+
enable_eplb=updated_config.parallel_config.enable_eplb,
169167
)
170168
self.worker.model_runner.eep_eplb_suppressed = True
171169
standby_ep_group = get_standby_ep_group()

vllm/distributed/elastic_ep/elastic_state.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -563,15 +563,4 @@ def _update_parallel_config(self):
563563
parallel_config._data_parallel_master_port_list = (
564564
reconfig_request.new_data_parallel_master_port_list
565565
)
566-
parallel_config._stateless_world_group_port_list = (
567-
reconfig_request.new_stateless_world_group_port_list
568-
)
569-
parallel_config._stateless_dp_group_port_list = (
570-
reconfig_request.new_stateless_dp_group_port_list
571-
)
572-
parallel_config._stateless_ep_group_port_list = (
573-
reconfig_request.new_stateless_ep_group_port_list
574-
)
575-
parallel_config._stateless_eplb_group_port_list = (
576-
reconfig_request.new_stateless_eplb_group_port_list
577-
)
566+
parallel_config._coord_store_port = reconfig_request.coord_store_port

vllm/distributed/elastic_ep/standby_state.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,8 @@ def create_standby_groups(
3838
new_dp_size: int,
3939
new_world_size_across_dp: int,
4040
master_ip: str,
41-
world_group_ports: list[list[int]],
42-
dp_group_ports: list[list[int]],
43-
ep_group_ports: list[list[int]],
44-
eplb_group_ports: list[list[int]] | None = None,
41+
coord_store_port: int,
42+
enable_eplb: bool = True,
4543
backend: str | None = None,
4644
) -> None:
4745
global \
@@ -51,19 +49,23 @@ def create_standby_groups(
5149
_STANDBY_EP, \
5250
_STANDBY_EPLB
5351

52+
from vllm.distributed.utils import get_cached_tcp_store_client
53+
5454
assert new_world_size_across_dp == torch.distributed.get_world_size() * new_dp_size
5555
world_group = get_world_group()
5656
assert isinstance(world_group, StatelessGroupCoordinator)
5757
backend = backend or world_group.backend
5858

59+
coord_store = get_cached_tcp_store_client(master_ip, coord_store_port)
60+
5961
standby_world_ranks = [list(range(new_world_size_across_dp))]
6062
_STANDBY_WORLD = _init_stateless_group(
6163
standby_world_ranks,
6264
"world",
63-
world_group_ports,
6465
master_ip,
6566
backend,
6667
use_device_communicator=False,
68+
coord_store=coord_store,
6769
)
6870
_STANDBY_WORLD_NODE_COUNT = _node_count(_STANDBY_WORLD.tcp_store_group)
6971

@@ -76,20 +78,24 @@ def create_standby_groups(
7678
standby_dp_ranks = all_ranks.transpose(1, 3).reshape(-1, new_dp_size).unbind(0)
7779
standby_dp_ranks = [x.tolist() for x in standby_dp_ranks]
7880
_STANDBY_DP = _init_stateless_group(
79-
standby_dp_ranks, "dp", dp_group_ports, master_ip, backend
81+
standby_dp_ranks, "dp", master_ip, backend, coord_store=coord_store
8082
)
8183

8284
standby_ep_ranks = (
8385
all_ranks.transpose(1, 2).reshape(-1, new_dp_size * tp_size).unbind(0)
8486
)
8587
standby_ep_ranks = [x.tolist() for x in standby_ep_ranks]
8688
_STANDBY_EP = _init_stateless_group(
87-
standby_ep_ranks, "ep", ep_group_ports, master_ip, backend
89+
standby_ep_ranks, "ep", master_ip, backend, coord_store=coord_store
8890
)
8991

90-
if eplb_group_ports is not None:
92+
if enable_eplb:
9193
_STANDBY_EPLB = _init_stateless_group(
92-
standby_ep_ranks, "eplb", eplb_group_ports, master_ip, backend
94+
standby_ep_ranks,
95+
"eplb",
96+
master_ip,
97+
backend,
98+
coord_store=coord_store,
9399
)
94100

95101

vllm/distributed/parallel_state.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,16 @@
4040
import torch.distributed
4141
import torch.distributed._functional_collectives as funcol
4242
import torch.distributed._symmetric_memory
43-
from torch.distributed import Backend, ProcessGroup
43+
from torch.distributed import Backend, ProcessGroup, Store
4444

4545
import vllm.envs as envs
4646
from vllm.distributed.device_communicators.base_device_communicator import (
4747
DeviceCommunicatorBase,
4848
)
49-
from vllm.distributed.utils import StatelessProcessGroup
49+
from vllm.distributed.utils import (
50+
StatelessProcessGroup,
51+
get_cached_tcp_store_client,
52+
)
5053
from vllm.logger import init_logger
5154
from vllm.utils.import_utils import resolve_obj_by_qualname
5255
from vllm.utils.network_utils import get_distributed_init_method
@@ -1164,9 +1167,9 @@ def init_model_parallel_group(
11641167
def _init_stateless_group(
11651168
group_ranks: list[list[int]],
11661169
group_name: str,
1167-
group_ports: list[list[int]],
11681170
host: str,
11691171
backend: str,
1172+
coord_store: Store,
11701173
use_device_communicator: bool = True,
11711174
) -> "StatelessGroupCoordinator":
11721175
"""Create a StatelessGroupCoordinator with the given parameters."""
@@ -1180,7 +1183,7 @@ def _init_stateless_group(
11801183
use_device_communicator=use_device_communicator,
11811184
group_name=group_name,
11821185
host=host,
1183-
group_ports=group_ports,
1186+
coord_store=coord_store,
11841187
global_rank=world.rank,
11851188
global_world_size=world.world_size,
11861189
)
@@ -1321,15 +1324,17 @@ def _init_elastic_ep_world(
13211324
group_ranks = [all_ranks[i : i + 1] for i in range(global_world_size)]
13221325
if global_rank in all_ranks:
13231326
group_ranks = [all_ranks]
1324-
group_ports = [parallel_config.get_next_stateless_world_group_port()]
1327+
coord_store = get_cached_tcp_store_client(
1328+
parallel_config.data_parallel_master_ip, parallel_config._coord_store_port
1329+
)
13251330
world = StatelessGroupCoordinator(
13261331
group_ranks=group_ranks,
13271332
local_rank=local_rank,
13281333
torch_distributed_backend=backend,
13291334
use_device_communicator=False,
13301335
group_name="world",
13311336
host=parallel_config.data_parallel_master_ip,
1332-
group_ports=group_ports,
1337+
coord_store=coord_store,
13331338
global_rank=global_rank,
13341339
global_world_size=global_world_size,
13351340
)
@@ -1513,7 +1518,13 @@ def initialize_model_parallel(
15131518
config = get_current_vllm_config()
15141519
data_parallel_size = config.parallel_config.data_parallel_size
15151520
enable_elastic_ep = config.parallel_config.enable_elastic_ep
1521+
parallel_config = config.parallel_config
1522+
coord_store: Store | None = None
15161523
if enable_elastic_ep:
1524+
coord_store = get_cached_tcp_store_client(
1525+
parallel_config.data_parallel_master_ip,
1526+
parallel_config._coord_store_port,
1527+
)
15171528
# Use stateless world group for global information
15181529
world_size = get_world_group().world_size
15191530
rank = get_world_group().rank
@@ -1633,16 +1644,12 @@ def initialize_model_parallel(
16331644
group_ranks = all_ranks.transpose(1, 4).reshape(-1, data_parallel_size).unbind(0)
16341645
group_ranks = [x.tolist() for x in group_ranks]
16351646
if enable_elastic_ep:
1636-
parallel_config = config.parallel_config
1637-
dp_ports = [
1638-
parallel_config.get_next_stateless_dp_group_port() for _ in group_ranks
1639-
]
16401647
_DP = _init_stateless_group(
16411648
group_ranks,
16421649
"dp",
1643-
dp_ports,
16441650
parallel_config.data_parallel_master_ip,
16451651
backend,
1652+
coord_store=coord_store,
16461653
)
16471654
else:
16481655
_DP = init_model_parallel_group(
@@ -1665,16 +1672,12 @@ def initialize_model_parallel(
16651672
)
16661673
group_ranks = [x.tolist() for x in group_ranks]
16671674
if enable_elastic_ep:
1668-
parallel_config = config.parallel_config
1669-
ep_ports = [
1670-
parallel_config.get_next_stateless_ep_group_port() for _ in group_ranks
1671-
]
16721675
_EP = _init_stateless_group(
16731676
group_ranks,
16741677
"ep",
1675-
ep_ports,
16761678
parallel_config.data_parallel_master_ip,
16771679
backend,
1680+
coord_store=coord_store,
16781681
)
16791682
else:
16801683
_EP = init_model_parallel_group(
@@ -1693,16 +1696,12 @@ def initialize_model_parallel(
16931696
and config.parallel_config.enable_eplb
16941697
):
16951698
if enable_elastic_ep:
1696-
eplb_ports = [
1697-
parallel_config.get_next_stateless_eplb_group_port()
1698-
for _ in group_ranks
1699-
]
17001699
_EPLB = _init_stateless_group(
17011700
group_ranks,
17021701
"eplb",
1703-
eplb_ports,
17041702
parallel_config.data_parallel_master_ip,
17051703
backend,
1704+
coord_store=coord_store,
17061705
)
17071706
else:
17081707
_EPLB = init_model_parallel_group(

0 commit comments

Comments (0)