Skip to content

Commit 55da6cc

Browse files
Implement RESTART_GANG runtime failure policy
Signed-off-by: jeffreywang <jeffreywang@anyscale.com>
1 parent 1a4f124 commit 55da6cc

File tree

7 files changed

+241
-80
lines changed

7 files changed

+241
-80
lines changed

python/ray/serve/_private/common.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -879,11 +879,7 @@ class GangContext:
879879

880880
@dataclass
881881
class GangPlacementGroupRequest:
882-
"""Request to prepare gang placement groups for a deployment.
883-
884-
Used in Step 3.5 of DeploymentStateManager.update() to pre-create
885-
placement groups before replicas are created in Step 4.
886-
"""
882+
"""Request to prepare gang placement groups for a deployment."""
887883

888884
deployment_id: DeploymentID
889885
gang_size: int
@@ -894,10 +890,7 @@ class GangPlacementGroupRequest:
894890

895891
@dataclass
896892
class GangPreparationResult:
897-
"""Result of gang placement group preparation.
898-
899-
Contains either successfully created placement groups or error information.
900-
"""
893+
"""Result of gang placement group preparation."""
901894

902895
success: bool
903896
error_message: Optional[str] = None

python/ray/serve/_private/deployment_scheduler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import copy
22
import logging
33
import sys
4+
import uuid
45
import warnings
56
from abc import ABC, abstractmethod
67
from collections import defaultdict
@@ -1028,7 +1029,8 @@ def _prepare_gangs_for_deployment(
10281029
]
10291030

10301031
pg_name = (
1031-
f"gang_{deployment_id.app_name}_{deployment_id.name}_{gang_index}"
1032+
f"gang_{deployment_id.app_name}_{deployment_id.name}"
1033+
f"_{gang_index}_{uuid.uuid4().hex[:8]}"
10321034
)
10331035
strategy = request.gang_placement_strategy
10341036

python/ray/serve/_private/deployment_state.py

Lines changed: 101 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
RunningReplicaInfo,
3838
)
3939
from ray.serve._private.config import DeploymentConfig
40+
from ray.serve.config import GangRuntimeFailurePolicy
4041
from ray.serve._private.constants import (
4142
DEFAULT_LATENCY_BUCKET_MS,
4243
MAX_PER_REPLICA_RETRY_COUNT,
@@ -999,7 +1000,13 @@ def check_stopped(self) -> bool:
9991000
# Remove the placement group both if the actor has already been deleted or
10001001
# it was just killed above.
10011002
if stopped and self._placement_group is not None:
1002-
ray.util.remove_placement_group(self._placement_group)
1003+
try:
1004+
ray.util.remove_placement_group(self._placement_group)
1005+
except Exception:
1006+
# Gang PGs are shared across multiple replicas.
1007+
# Another replica in the same gang may have already
1008+
# removed this PG.
1009+
pass
10031010

10041011
return stopped
10051012

@@ -3043,13 +3050,18 @@ def _add_replicas_with_gang_scheduling(
30433050
"""
30443051
upscale = []
30453052

3046-
# Check if gang PG preparation failed
3047-
if gang_prep_result is None or not gang_prep_result.success:
3048-
error_msg = (
3049-
gang_prep_result.error_message
3050-
if gang_prep_result
3051-
else "Gang placement groups were not prepared"
3053+
# PG prep was not attempted (e.g. replicas still stopping).
3054+
# Skip replica creation and retry in the next reconciliation loop.
3055+
if gang_prep_result is None:
3056+
logger.info(
3057+
f"Gang PG preparation was skipped for {self._id}. "
3058+
"Will retry in the next reconciliation loop."
30523059
)
3060+
return upscale
3061+
3062+
# PG prep was attempted but failed (resources insufficient).
3063+
if not gang_prep_result.success:
3064+
error_msg = gang_prep_result.error_message or "Unknown error"
30533065
logger.error(
30543066
f"Gang scheduling failed for {self._id}: {error_msg}. "
30553067
"Skipping replica creation."
@@ -3357,7 +3369,22 @@ def check_and_update_replicas(self):
33573369
transition happened.
33583370
"""
33593371

3360-
# TODO (jeffreywang): Implement gang health check and runtime failure policy here.
3372+
gang_config = self.get_gang_config()
3373+
restart_gang = (
3374+
gang_config is not None
3375+
and gang_config.runtime_failure_policy
3376+
== GangRuntimeFailurePolicy.RESTART_GANG
3377+
)
3378+
3379+
# --- Gang health check: two-pass approach ---
3380+
# Pass 1: Check health of all replicas. Collect which gang_ids
3381+
# have at least one unhealthy member.
3382+
# Pass 2: Process results. Healthy replicas whose gang has an
3383+
# unhealthy member are forcefully stopped too.
3384+
healthy_replicas: List[DeploymentReplica] = []
3385+
unhealthy_replicas: List[DeploymentReplica] = []
3386+
gang_ids_to_restart: Set[str] = set()
3387+
33613388
for replica in self._replicas.pop(
33623389
states=[ReplicaState.RUNNING, ReplicaState.PENDING_MIGRATION]
33633390
):
@@ -3375,6 +3402,38 @@ def check_and_update_replicas(self):
33753402
self.health_check_failures_counter.inc(tags=metric_tags)
33763403

33773404
if is_healthy:
3405+
healthy_replicas.append(replica)
3406+
else:
3407+
unhealthy_replicas.append(replica)
3408+
if restart_gang and replica.gang_context is not None:
3409+
gang_ids_to_restart.add(replica.gang_context.gang_id)
3410+
3411+
# Pass 2: process healthy replicas.
3412+
for replica in healthy_replicas:
3413+
if (
3414+
restart_gang
3415+
and replica.gang_context is not None
3416+
and replica.gang_context.gang_id in gang_ids_to_restart
3417+
):
3418+
# Healthy replica whose gang has an unhealthy member.
3419+
# Forcefully stop it so the entire gang is rescheduled.
3420+
logger.warning(
3421+
f"Replica {replica.replica_id} is healthy but its gang "
3422+
f"(gang_id={replica.gang_context.gang_id}) has an "
3423+
"unhealthy replica. Forcefully stopping it because "
3424+
"RESTART_GANG runtime failure policy is enabled."
3425+
)
3426+
self._stop_replica(replica, graceful_stop=False)
3427+
if replica.version == self._target_state.version:
3428+
self._curr_status_info = (
3429+
self._curr_status_info.handle_transition(
3430+
trigger=DeploymentStatusInternalTrigger.HEALTH_CHECK_FAILED,
3431+
message="A replica's health check failed. This "
3432+
"deployment will be UNHEALTHY until the replica "
3433+
"recovers or a new deploy happens.",
3434+
)
3435+
)
3436+
else:
33783437
self._replicas.add(replica.actor_details.state, replica)
33793438
self.health_check_gauge.set(
33803439
1,
@@ -3384,29 +3443,33 @@ def check_and_update_replicas(self):
33843443
)
33853444
routing_stats = replica.pull_routing_stats()
33863445
replica.record_routing_stats(routing_stats)
3387-
else:
3388-
logger.warning(
3389-
f"Replica {replica.replica_id} failed health check, stopping it."
3390-
)
3391-
self.health_check_gauge.set(
3392-
0,
3393-
tags={
3394-
"replica": replica.replica_id.unique_id,
3395-
},
3396-
)
3397-
self._stop_replica(
3398-
replica, graceful_stop=not self.FORCE_STOP_UNHEALTHY_REPLICAS
3446+
3447+
# Process unhealthy replicas with force-stop for gang replicas under
3448+
# RESTART_GANG policy.
3449+
for replica in unhealthy_replicas:
3450+
logger.warning(
3451+
f"Replica {replica.replica_id} failed health check, stopping it."
3452+
)
3453+
self.health_check_gauge.set(
3454+
0,
3455+
tags={
3456+
"replica": replica.replica_id.unique_id,
3457+
},
3458+
)
3459+
graceful = not self.FORCE_STOP_UNHEALTHY_REPLICAS
3460+
if restart_gang and replica.gang_context is not None:
3461+
graceful = False
3462+
self._stop_replica(replica, graceful_stop=graceful)
3463+
# If this is a replica of the target version, the deployment
3464+
# enters the "UNHEALTHY" status until the replica is
3465+
# recovered or a new deploy happens.
3466+
if replica.version == self._target_state.version:
3467+
self._curr_status_info = self._curr_status_info.handle_transition(
3468+
trigger=DeploymentStatusInternalTrigger.HEALTH_CHECK_FAILED,
3469+
message="A replica's health check failed. This "
3470+
"deployment will be UNHEALTHY until the replica "
3471+
"recovers or a new deploy happens.",
33993472
)
3400-
# If this is a replica of the target version, the deployment
3401-
# enters the "UNHEALTHY" status until the replica is
3402-
# recovered or a new deploy happens.
3403-
if replica.version == self._target_state.version:
3404-
self._curr_status_info = self._curr_status_info.handle_transition(
3405-
trigger=DeploymentStatusInternalTrigger.HEALTH_CHECK_FAILED,
3406-
message="A replica's health check failed. This "
3407-
"deployment will be UNHEALTHY until the replica "
3408-
"recovers or a new deploy happens.",
3409-
)
34103473

34113474
slow_start_replicas = []
34123475
slow_start = self._check_startup_replicas(ReplicaState.STARTING)
@@ -4316,6 +4379,14 @@ def _prepare_gang_placement_groups(
43164379
if deployment_state._terminally_failed():
43174380
continue
43184381

4382+
# Skip if deployment has replicas still stopping. Their resources
4383+
# haven't been released yet, so PG creation would likely fail or
4384+
# block waiting for resources. We'll retry next reconciliation loop.
4385+
if deployment_state._replicas.count(
4386+
states=[ReplicaState.STOPPING]
4387+
) > 0:
4388+
continue
4389+
43194390
gang_requests[deployment_id] = GangPlacementGroupRequest(
43204391
deployment_id=deployment_id,
43214392
gang_size=gang_config.gang_size,

python/ray/serve/config.py

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -797,37 +797,17 @@ class GangRuntimeFailurePolicy(str, Enum):
797797
"""Policy for handling runtime failures of replicas in a gang."""
798798

799799
RESTART_GANG = "RESTART_GANG"
800-
"""Kill and restart entire gang atomically when any replica fails.
801-
Use for: Tightly coupled systems where partial gang is useless.
802-
Ensures consistency but higher recovery time."""
800+
"""Tear down and restart entire gang atomically when any replica fails."""
803801

804802
RESTART_REPLICA = "RESTART_REPLICA"
805-
"""Kill and restart individual replica when it fails.
806-
Use for: Systems that can tolerate partial gang availability.
807-
Faster recovery but may result in inconsistent state."""
803+
"""Tear down and restart individual replica when it fails. Other replicas in the gang will continue running."""
808804

809805

810806
@PublicAPI(stability="alpha")
811807
class GangSchedulingConfig(BaseModel):
812-
"""Configuration for gang scheduling of deployment replicas.
813-
Gang scheduling ensures that groups of replicas are scheduled together
814-
atomically, which is essential for distributed workloads that require
815-
coordination between replicas.
816-
Example:
817-
.. code-block:: python
818-
from ray import serve
819-
from ray.serve.config import GangSchedulingConfig, GangPlacementStrategy
820-
@serve.deployment(
821-
num_replicas=8,
822-
gang_scheduling_config=GangSchedulingConfig(
823-
gang_size=4,
824-
gang_placement_strategy=GangPlacementStrategy.STRICT_PACK,
825-
runtime_failure_policy=GangRuntimeFailurePolicy.RESTART_GANG
826-
)
827-
)
828-
class MyDeployment:
829-
pass
830-
"""
808+
"""Configuration for gang scheduling of deployment replicas."""
809+
810+
# Please keep these options in sync with those in `src/ray/protobuf/serve.proto`.
831811

832812
gang_size: int = Field(
833813
description=(
@@ -854,3 +834,11 @@ class MyDeployment:
854834
"RESTART_REPLICA: kill and restart individual replica."
855835
),
856836
)
837+
838+
@validator("runtime_failure_policy", always=True)
839+
def _validate_runtime_failure_policy(cls, v):
840+
if v == GangRuntimeFailurePolicy.RESTART_REPLICA:
841+
raise NotImplementedError(
842+
"RESTART_REPLICA policy is not yet implemented."
843+
)
844+
return v

python/ray/serve/tests/test_deployment_scheduler.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -951,25 +951,18 @@ def __call__(self):
951951
assert len(gangs) == 2
952952

953953
for gang_id, members in gangs.items():
954-
# Each gang should have exactly 2 replicas
955954
assert len(members) == 2
956-
957-
# All members should have the same world_size
958955
assert all(member["world_size"] == 2 for member in members)
959-
960-
# All members should have the same member_replica_ids
961956
assert members[0]["member_replica_ids"] == members[1]["member_replica_ids"]
962957

963-
# member_replica_ids should contain exactly the 2 replica IDs in this gang
964958
expected_ids = sorted([m["replica_id"] for m in members])
965959
actual_ids = sorted(members[0]["member_replica_ids"])
966960
assert actual_ids == expected_ids
967961

968-
# Ranks within the gang should be {0, 1}
969962
ranks = sorted([m["rank"] for m in members])
970963
assert ranks == [0, 1]
971964

972-
# Across gangs: gang_ids should be different (already guaranteed by dict keys)
965+
# Across gangs: gang_ids should be different
973966
gang_ids = list(gangs.keys())
974967
assert gang_ids[0] != gang_ids[1]
975968

Comments (0)