[Data] Delay 'cluster resources not enough' warning until operator is persistently starved (#63969)

bveeramani · web-flow · commit a915fe0da247 · 2026-06-09T23:02:14.000Z
## Description A dataset's resource allocator depends on the `AutoscalingCoordinator` server to get its share of allocated resources. To improve reliability, #62725 made calls to the server non-blocking. One consequence of this change is that the dataset gets zero resources at the very start of execution while it waits for the first response from the autoscaling coordinator. As a result, we'd consistently emit spurious warnings like this at the start of execution: ``` Cluster resources are not enough to run any task from TaskPoolMapOperator[ReadRange]. The job may hang forever unless the cluster scales up. ``` To avoid this confusion, I've made it so that we only emit the warning after the first eligible operator has been starved for a minute. ## Related issues ## Additional information --------- Signed-off-by: Balaji Veeramani <bveeramani@berkeley.edu>
diff --git a/python/ray/data/_internal/execution/resource_manager.py b/python/ray/data/_internal/execution/resource_manager.py
@@ -39,6 +39,13 @@
     "RAY_DATA_DEBUG_RESOURCE_MANAGER", None
 )
 
+# Only warn that the cluster can't run any task once the operator has been starved of
+# its minimum resources for this long. This avoids spurious warnings while the cluster
+# is still scaling up or waiting for a response from the autoscaling coordinator.
+#
+# Value is in seconds. The default of 60 was chosen arbitrarily, not tuned.
+STARVATION_WARNING_DELAY_S = env_float("RAY_DATA_STARVATION_WARNING_DELAY_S", 60)
+
 
 # Following list is a list of *blocking* materializing operators, that prevent
 # operators downstream from them from starting execution until these operators
@@ -705,6 +712,9 @@ def __init__(self, resource_manager: ResourceManager, reservation_ratio: float):
         # enough to run one task of each op.
         # See `test_no_deadlock_on_small_cluster_resources` as an example.
         self._reserved_min_resources: Dict[PhysicalOperator, bool] = {}
+        # `time.monotonic()` timestamp at which each operator most recently became
+        # starved of its minimum resources, or None if it currently has them.
+        self._op_starved_since: Dict[PhysicalOperator, Optional[float]] = {}
 
     def _update_reservation(self, limits: ExecutionResources):
         eligible_ops = self._resource_manager.get_eligible_ops()
@@ -746,27 +756,27 @@ def _update_reservation(self, limits: ExecutionResources):
                 remaining, ignore_object_store_memory=True
             ):
                 self._reserved_min_resources[op] = True
+                self._op_starved_since[op] = None
             else:
+                self._reserved_min_resources[op] = False
+                if self._op_starved_since.get(op) is None:
+                    self._op_starved_since[op] = time.monotonic()
+
                 # If the remaining resources are not enough to reserve the minimum
                 # resources for this operator, we'll only reserve the minimum object
                 # store memory, but not the CPU and GPU resources.
                 # Because Ray Core doesn't allow CPU/GPU resources to be oversubscribed.
                 # NOTE: we prioritize upstream operators for minimum resource reservation.
                 # ops. It's fine that downstream ops don't get the minimum reservation,
                 # because they can wait for upstream ops to finish and release resources.
-                self._reserved_min_resources[op] = False
                 reserved_for_tasks = ExecutionResources(
                     0, 0, min_resource_usage.object_store_memory
                 )
-                # Add `id(self)` to the log_once key so that it will be logged once
-                # per execution.
-                if index == 0 and log_once(f"low_resource_warning_{id(self)}"):
-                    # Log a warning if even the first operator cannot reserve
-                    # the minimum resources.
-                    logger.warning(
-                        f"Cluster resources are not enough to run any task from {op}."
-                        " The job may hang forever unless the cluster scales up."
-                    )
+
+            # Log a warning if even the first operator cannot reserve the minimum
+            # resources.
+            if index == 0:
+                self._warn_if_op_starved_too_long(op)
 
             self._op_reserved[op] = reserved_for_tasks
             self._reserved_for_op_outputs[op] = reserved_for_outputs.object_store_memory
@@ -777,6 +787,23 @@ def _update_reservation(self, limits: ExecutionResources):
 
         self._total_shared = remaining
 
+    def _warn_if_op_starved_too_long(self, op: PhysicalOperator) -> None:
+        # Nothing to report unless a starvation start time is recorded for this op.
+        if self._op_starved_since.get(op) is None:
+            return
+
+        op_starved_duration = time.monotonic() - self._op_starved_since[op]
+        if (
+            op_starved_duration >= STARVATION_WARNING_DELAY_S
+            # `and` short-circuits, so `log_once` runs only after the delay has passed.
+            # `id(self)` in the key makes the warning log once per execution.
+            and log_once(f"starvation_warning_{id(self)}")
+        ):
+            logger.warning(
+                f"Cluster resources are not enough to run any task from {op}."
+                " The job may hang forever unless the cluster scales up."
+            )
+
     def can_submit_new_task(self, op: PhysicalOperator) -> bool:
         """Return whether the given operator can submit a new task based on budget."""
         budget = self.get_budget(op)