-
Notifications
You must be signed in to change notification settings - Fork 7.2k
[Data] Add descriptive error when using local:// paths with a zero-resource head node
#60709
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -368,6 +368,94 @@ def _is_local_scheme(paths: Union[str, List[str]]) -> bool: | |
| return num == len(paths) | ||
|
|
||
|
|
||
| def _validate_head_node_resources_for_local_scheduling( | ||
| ray_remote_args: Dict[str, Any], | ||
| *, | ||
| op_name: str, | ||
| required_num_cpus: int = 1, | ||
| required_num_gpus: int = 0, | ||
| required_memory: int = 0, | ||
| ) -> None: | ||
| """Ensure the head node has enough resources before pinning work there. | ||
|
|
||
| Local paths (``local://``) and other driver-local I/O force tasks onto the | ||
| head node via ``NodeAffinitySchedulingStrategy``. If the head node was | ||
| intentionally started with zero logical resources (a common practice to | ||
| avoid OOMs), those tasks become unschedulable. Detect this upfront and | ||
| raise a clear error with remediation steps. | ||
| """ | ||
|
|
||
| # Ray defaults to reserving 1 CPU per task when num_cpus isn't provided. | ||
| num_cpus = ray_remote_args.get("num_cpus", required_num_cpus) | ||
| num_gpus = ray_remote_args.get("num_gpus", required_num_gpus) | ||
| memory = ray_remote_args.get("memory", required_memory) | ||
|
|
||
| required_resources: Dict[str, float] = {} | ||
| if num_cpus > 0: | ||
| required_resources["CPU"] = float(num_cpus) | ||
| if num_gpus > 0: | ||
| required_resources["GPU"] = float(num_gpus) | ||
| if memory > 0: | ||
| required_resources["memory"] = float(memory) | ||
|
|
||
| # Include any additional custom resources requested. | ||
| custom_resources = ray_remote_args.get("resources", {}) | ||
| for name, amount in custom_resources.items(): | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing None handling for resources dict causes AttributeErrorMedium Severity Similar to the standard resource fields, if |
||
| if amount is None: | ||
| continue | ||
| try: | ||
| amount = float(amount) | ||
| except (TypeError, ValueError) as err: | ||
| raise ValueError(f"Invalid resource amount for '{name}': {amount}") from err | ||
| if amount > 0: | ||
| required_resources[name] = amount | ||
|
|
||
| # If there are no positive resource requirements, the task can run on a | ||
| # zero-resource head node (e.g., num_cpus=0 opt-out), so nothing to check. | ||
| if not required_resources: | ||
| return | ||
|
|
||
| head_node = next( | ||
| ( | ||
| node | ||
| for node in ray.nodes() | ||
| if node.get("Alive") | ||
| and "node:__internal_head__" in node.get("Resources", {}) | ||
| ), | ||
| None, | ||
| ) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Validation checks head node but tasks scheduled elsewhereMedium Severity The validation function explicitly searches for the head node using Additional Locations (2) |
||
| if not head_node: | ||
| # The head node metadata is unavailable (e.g., during shutdown). Fall back | ||
| # to the default behavior and let Ray surface its own error. | ||
| return | ||
|
Comment on lines
+427
to
+430
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For my understanding, do u have a script of when this can occur? (
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the question. |
||
|
|
||
| # Build a map of required vs available resources on the head node. | ||
| head_resources: Dict[str, float] = head_node.get("Resources", {}) | ||
| # Map: resource name -> (required, available). | ||
| insufficient: Dict[str, Tuple[float, float]] = {} | ||
| for name, req in required_resources.items(): | ||
| avail = head_resources.get(name, 0.0) | ||
| if avail < req: | ||
| insufficient[name] = (req, avail) | ||
|
|
||
| # If nothing is below the required amount, we are good to proceed. | ||
| if not insufficient: | ||
| return | ||
|
|
||
| details = "; ".join( | ||
| f"{name} required {req:g} but head has {avail:g}" | ||
| for name, (req, avail) in insufficient.items() | ||
| ) | ||
|
|
||
| raise ValueError( | ||
| f"{op_name} must run on the head node (e.g., for local:// paths), " | ||
| f"but the head node doesn't have enough resources: {details}. " | ||
| "Add resources to the head node, switch to a shared filesystem instead " | ||
| "of local://, or set the resource requests on this operation to 0 " | ||
| "(for example, num_cpus=0) so it can run without head resources." | ||
| ) | ||
|
|
||
|
|
||
| def _truncated_repr(obj: Any) -> str: | ||
| """Utility to return a truncated object representation for error messages.""" | ||
| msg = str(obj) | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing None handling for standard resource arguments
Medium Severity
The standard resources (`num_cpus`, `num_gpus`, `memory`) use `.get()` with a default value, but the default only applies when the key is absent. If `ray_remote_args` contains an explicit `None` value (e.g., `{"num_cpus": None}`), `.get()` returns `None`, and the subsequent comparison such as `num_cpus > 0` raises a `TypeError`. This is inconsistent with the custom resources handling at lines 404-405, which explicitly checks for and skips `None` values.