Skip to content

Commit b60f662

Browse files
committed
don't decrement on fail
Signed-off-by: abrar <abrar@anyscale.com>
1 parent 76faff6 commit b60f662

File tree

3 files changed

+165
-8
lines changed

3 files changed

+165
-8
lines changed

ci/lint/pydoclint-baseline.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,9 +1513,6 @@ python/ray/serve/_private/controller.py
15131513
python/ray/serve/_private/deploy_utils.py
15141514
DOC201: Function `get_app_code_version` does not have a return section in docstring
15151515
--------------------
1516-
python/ray/serve/_private/deployment_scheduler.py
1517-
DOC201: Method `DeploymentScheduler._schedule_replica` does not have a return section in docstring
1518-
--------------------
15191516
python/ray/serve/_private/deployment_state.py
15201517
DOC201: Method `ReplicaStateContainer.get` does not have a return section in docstring
15211518
DOC201: Method `ReplicaStateContainer.pop` does not have a return section in docstring

python/ray/serve/_private/deployment_scheduler.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -533,7 +533,7 @@ def _schedule_replica(
533533
default_scheduling_strategy: str,
534534
target_node_id: Optional[str] = None,
535535
target_labels: Optional[LabelMatchExpressionsT] = None,
536-
):
536+
) -> bool:
537537
"""Schedule a replica from a scheduling request.
538538
539539
The following special scheduling strategies will be used, in
@@ -555,6 +555,9 @@ def _schedule_replica(
555555
target node.
556556
target_labels: Attempt to schedule this replica onto nodes
557557
with these target labels.
558+
559+
Returns:
560+
True if the replica was successfully scheduled, False otherwise.
558561
"""
559562

560563
replica_id = scheduling_request.replica_id
@@ -588,7 +591,7 @@ def _schedule_replica(
588591
scheduling_request.status = (
589592
ReplicaSchedulingRequestStatus.PLACEMENT_GROUP_CREATION_FAILED
590593
)
591-
return
594+
return False
592595
scheduling_strategy = PlacementGroupSchedulingStrategy(
593596
placement_group=pg,
594597
placement_group_capture_child_tasks=True,
@@ -629,7 +632,7 @@ def _schedule_replica(
629632
scheduling_request.status = (
630633
ReplicaSchedulingRequestStatus.ACTOR_CREATION_FAILED
631634
)
632-
return
635+
return False
633636

634637
del self._pending_replicas[deployment_id][replica_id]
635638
self._on_replica_launching(
@@ -641,6 +644,7 @@ def _schedule_replica(
641644

642645
scheduling_request.status = ReplicaSchedulingRequestStatus.SUCCEEDED
643646
scheduling_request.on_scheduled(actor_handle, placement_group=placement_group)
647+
return True
644648

645649
@abstractmethod
646650
def get_node_to_compact(
@@ -801,13 +805,13 @@ def _pack_schedule_replica(
801805
if target_node:
802806
break
803807

804-
self._schedule_replica(
808+
succeeded = self._schedule_replica(
805809
scheduling_request,
806810
default_scheduling_strategy="DEFAULT",
807811
target_node_id=target_node,
808812
)
809813

810-
return target_node
814+
return target_node if succeeded else None
811815

812816
def _build_pack_placement_candidates(
813817
self, scheduling_request: ReplicaSchedulingRequest

python/ray/serve/tests/unit/test_deployment_scheduler.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
DeploymentDownscaleRequest,
2020
DeploymentSchedulingInfo,
2121
ReplicaSchedulingRequest,
22+
ReplicaSchedulingRequestStatus,
2223
Resources,
2324
SpreadDeploymentSchedulingPolicy,
2425
)
@@ -1424,6 +1425,161 @@ def on_scheduled(actor_handle, placement_group):
14241425
downscales={},
14251426
)
14261427

1428+
def test_actor_creation_failure_does_not_decrement_resources(self):
1429+
"""When actor creation fails for a replica, available resources
1430+
should not be decremented so subsequent replicas in the same
1431+
scheduling batch can still use that node.
1432+
"""
1433+
1434+
d_id = DeploymentID(name="deployment1")
1435+
node_id = NodeID.from_random().hex()
1436+
1437+
cluster_node_info_cache = MockClusterNodeInfoCache()
1438+
# Node has exactly 2 CPUs — enough for two 1-CPU replicas.
1439+
cluster_node_info_cache.add_node(node_id, {"CPU": 2})
1440+
1441+
scheduler = default_impl.create_deployment_scheduler(
1442+
cluster_node_info_cache,
1443+
head_node_id_override="fake-head-node-id",
1444+
create_placement_group_fn_override=None,
1445+
)
1446+
scheduler.on_deployment_created(d_id, SpreadDeploymentSchedulingPolicy())
1447+
scheduler.on_deployment_deployed(
1448+
d_id,
1449+
ReplicaConfig.create(dummy, ray_actor_options={"num_cpus": 1}),
1450+
)
1451+
1452+
# Create a mock actor class whose .options().remote() raises on the
1453+
# first call (simulating actor creation failure) but succeeds after.
1454+
call_count = 0
1455+
1456+
class FailOnceMockActorClass(MockActorClass):
1457+
def remote(self, *args):
1458+
nonlocal call_count
1459+
call_count += 1
1460+
if call_count == 1:
1461+
raise RuntimeError("Simulated actor creation failure")
1462+
return super().remote(*args)
1463+
1464+
on_scheduled_mock = Mock()
1465+
r0_id = ReplicaID(unique_id="r0", deployment_id=d_id)
1466+
r1_id = ReplicaID(unique_id="r1", deployment_id=d_id)
1467+
1468+
req0 = ReplicaSchedulingRequest(
1469+
replica_id=r0_id,
1470+
actor_def=FailOnceMockActorClass(),
1471+
actor_resources={"CPU": 1},
1472+
actor_options={},
1473+
actor_init_args=(),
1474+
on_scheduled=on_scheduled_mock,
1475+
)
1476+
req1 = ReplicaSchedulingRequest(
1477+
replica_id=r1_id,
1478+
actor_def=MockActorClass(),
1479+
actor_resources={"CPU": 1},
1480+
actor_options={},
1481+
actor_init_args=(),
1482+
on_scheduled=on_scheduled_mock,
1483+
)
1484+
1485+
scheduler.schedule(
1486+
upscales={d_id: [req0, req1]},
1487+
downscales={},
1488+
)
1489+
1490+
# The first replica should have failed.
1491+
assert req0.status == ReplicaSchedulingRequestStatus.ACTOR_CREATION_FAILED
1492+
1493+
# The second replica should have succeeded and been scheduled to the
1494+
# node.
1495+
assert req1.status == ReplicaSchedulingRequestStatus.SUCCEEDED
1496+
assert on_scheduled_mock.call_count == 1
1497+
call = on_scheduled_mock.call_args_list[0]
1498+
scheduling_strategy = call.args[0]._options["scheduling_strategy"]
1499+
assert isinstance(scheduling_strategy, NodeAffinitySchedulingStrategy)
1500+
assert scheduling_strategy.node_id == node_id
1501+
1502+
def test_pg_creation_failure_does_not_decrement_resources(self):
1503+
"""When placement group creation fails for a replica, available
1504+
resources should not be decremented so subsequent replicas in the
1505+
same scheduling batch can still use that node.
1506+
"""
1507+
1508+
d_id = DeploymentID(name="deployment1")
1509+
node_id = NodeID.from_random().hex()
1510+
1511+
cluster_node_info_cache = MockClusterNodeInfoCache()
1512+
# Node has exactly 2 CPUs — enough for two replicas with 1-CPU PGs.
1513+
cluster_node_info_cache.add_node(node_id, {"CPU": 2})
1514+
1515+
call_count = 0
1516+
1517+
def fail_once_create_pg(request):
1518+
nonlocal call_count
1519+
call_count += 1
1520+
if call_count == 1:
1521+
raise RuntimeError("Simulated PG creation failure")
1522+
return MockPlacementGroup(request)
1523+
1524+
scheduler = default_impl.create_deployment_scheduler(
1525+
cluster_node_info_cache,
1526+
head_node_id_override="fake-head-node-id",
1527+
create_placement_group_fn_override=fail_once_create_pg,
1528+
)
1529+
scheduler.on_deployment_created(d_id, SpreadDeploymentSchedulingPolicy())
1530+
scheduler.on_deployment_deployed(
1531+
d_id,
1532+
ReplicaConfig.create(
1533+
dummy,
1534+
ray_actor_options={"num_cpus": 0},
1535+
placement_group_bundles=[{"CPU": 1}],
1536+
placement_group_strategy="STRICT_PACK",
1537+
),
1538+
)
1539+
1540+
on_scheduled_mock = Mock()
1541+
r0_id = ReplicaID(unique_id="r0", deployment_id=d_id)
1542+
r1_id = ReplicaID(unique_id="r1", deployment_id=d_id)
1543+
1544+
req0 = ReplicaSchedulingRequest(
1545+
replica_id=r0_id,
1546+
actor_def=MockActorClass(),
1547+
actor_resources={"CPU": 0},
1548+
placement_group_bundles=[{"CPU": 1}],
1549+
placement_group_strategy="STRICT_PACK",
1550+
actor_options={"name": "r0"},
1551+
actor_init_args=(),
1552+
on_scheduled=on_scheduled_mock,
1553+
)
1554+
req1 = ReplicaSchedulingRequest(
1555+
replica_id=r1_id,
1556+
actor_def=MockActorClass(),
1557+
actor_resources={"CPU": 0},
1558+
placement_group_bundles=[{"CPU": 1}],
1559+
placement_group_strategy="STRICT_PACK",
1560+
actor_options={"name": "r1"},
1561+
actor_init_args=(),
1562+
on_scheduled=on_scheduled_mock,
1563+
)
1564+
1565+
scheduler.schedule(
1566+
upscales={d_id: [req0, req1]},
1567+
downscales={},
1568+
)
1569+
1570+
# The first replica should have failed at PG creation.
1571+
assert (
1572+
req0.status
1573+
== ReplicaSchedulingRequestStatus.PLACEMENT_GROUP_CREATION_FAILED
1574+
)
1575+
1576+
# The second replica should still succeed.
1577+
assert req1.status == ReplicaSchedulingRequestStatus.SUCCEEDED
1578+
assert on_scheduled_mock.call_count == 1
1579+
call = on_scheduled_mock.call_args_list[0]
1580+
scheduling_strategy = call.args[0]._options["scheduling_strategy"]
1581+
assert isinstance(scheduling_strategy, PlacementGroupSchedulingStrategy)
1582+
14271583

14281584
if __name__ == "__main__":
14291585
sys.exit(pytest.main(["-v", "-s", __file__]))

0 commit comments

Comments (0)