-
Notifications
You must be signed in to change notification settings - Fork 7.2k
[Core] Add fallback strategy support to placement groups #59024
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
ryanaoleary
wants to merge
14
commits into
ray-project:master
Choose a base branch
from
ryanaoleary:pg-fallback-strategy
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
629f7e2
Fix placement group bundle semantics and fallback logic
ryanaoleary 046deff
Merge branch 'master' into pg-fallback-strategy
ryanaoleary 893dee4
Merge branch 'master' into pg-fallback-strategy
ryanaoleary cc7a8a4
Add back changes dropped in bad rebase
ryanaoleary b42757b
Merge branch 'master' into pg-fallback-strategy
ryanaoleary f69cbaa
Fix loop to iterate over bundles_source
ryanaoleary fcefe96
Fix cursor comments and rename field to be clear its not strategy
ryanaoleary 2306900
Fix erroneously named field
ryanaoleary 3a74c66
Fix cacheing in placement group
ryanaoleary 27e4a6e
Clang format
ryanaoleary 0c5a113
Merge branch 'master' into pg-fallback-strategy
ryanaoleary 8f6bbcb
Fix truthiness of cache check
ryanaoleary 972ad3c
Merge branch 'master' into pg-fallback-strategy
ryanaoleary 060e03d
Fix cache call and comment
ryanaoleary File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ryanaoleary marked this conversation as resolved.
Show resolved
Hide resolved
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,233 @@ | ||
| import pytest | ||
|
|
||
| import ray | ||
| from ray._private.test_utils import placement_group_assert_no_leak | ||
| from ray.util.placement_group import ( | ||
| placement_group, | ||
| placement_group_table, | ||
| remove_placement_group, | ||
| ) | ||
|
|
||
|
|
||
def test_placement_group_fallback_resources(ray_start_cluster):
    """An infeasible primary bundle falls back to a feasible resource-based option."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    # The fallback requests no more CPU than the single node provides.
    feasible_fallback = [{"bundles": [{"CPU": 4}]}]

    # Primary request (8 CPUs) cannot fit on any node in the cluster.
    pg = placement_group(
        name="resource_fallback_pg",
        bundles=[{"CPU": 8}],
        strategy="PACK",
        fallback_strategy=feasible_fallback,
    )

    # The group only becomes ready because the fallback was applied.
    ray.get(pg.ready(), timeout=10)

    # Probe task used to confirm the node's CPUs are fully reserved.
    @ray.remote(num_cpus=1)
    def probe():
        return "ok"

    # All 4 CPUs are held by the placement group, so even a 1-CPU task stalls.
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(probe.remote(), timeout=2)

    remove_placement_group(pg)
    placement_group_assert_no_leak([pg])
|
|
||
|
|
||
def test_placement_group_fallback_strategy_labels(ray_start_cluster):
    """
    Test that fallback strategy is used when primary bundles are not feasible
    due to label constraints.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2, labels={})  # Unlabelled node
    cluster.add_node(num_cpus=2, labels={"region": "us-west1"})
    ray.init(address=cluster.address)

    # Fallback targets the labelled node, which exists in the cluster.
    labeled_fallback = [
        {"bundles": [{"CPU": 2}], "bundle_label_selector": [{"region": "us-west1"}]}
    ]

    # Primary selector asks for a region no node advertises.
    pg = placement_group(
        name="fallback_pg",
        bundles=[{"CPU": 2}],
        bundle_label_selector=[{"region": "us-east1"}],
        strategy="PACK",
        fallback_strategy=labeled_fallback,
    )

    # Succeeds only via the fallback option.
    ray.get(pg.ready(), timeout=10)

    # Look up which node hosts bundle 0 and confirm it carries the fallback label.
    bundle_node_id = placement_group_table(pg)["bundles_to_node_id"][0]
    scheduled_node = next(
        (node for node in ray.nodes() if node["NodeID"] == bundle_node_id), None
    )
    assert scheduled_node is not None, "Scheduled node not found in cluster state"
    assert scheduled_node["Labels"]["region"] == "us-west1"

    remove_placement_group(pg)
    placement_group_assert_no_leak([pg])
|
|
||
|
|
||
def test_placement_group_fallback_priority(ray_start_cluster):
    """The first feasible fallback option wins when several are feasible."""
    cluster = ray_start_cluster
    # Node has 10 CPUs.
    cluster.add_node(num_cpus=10)
    ray.init(address=cluster.address)

    # Ordered options: the 11-CPU entry is infeasible; the 5-CPU entry is the
    # first feasible one and must be chosen over the later 1-CPU entry.
    ordered_fallbacks = [
        {"bundles": [{"CPU": 11}]},  # Infeasible
        {"bundles": [{"CPU": 5}]},  # Feasible
        {"bundles": [{"CPU": 1}]},  # Feasible
    ]

    pg = placement_group(
        name="priority_pg",
        bundles=[{"CPU": 20}],  # Infeasible main bundles.
        strategy="PACK",
        fallback_strategy=ordered_fallbacks,
    )

    ray.get(pg.ready(), timeout=10)

    # If the 5-CPU fallback was taken, only 5 CPUs remain, so a 6-CPU task
    # cannot be scheduled. Had the 1-CPU option been taken, 9 CPUs would
    # remain and this task would succeed.
    @ray.remote(num_cpus=6)
    def heavy_task():
        return "ok"

    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(heavy_task.remote(), timeout=2)

    remove_placement_group(pg)
    placement_group_assert_no_leak([pg])
|
|
||
|
|
||
def test_placement_group_fallback_bundle_shapes(ray_start_cluster):
    """Test fallback works even when changing the number of bundles."""
    cluster = ray_start_cluster
    # Two single-CPU nodes: no node can host a 2-CPU bundle.
    for _ in range(2):
        cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    # Fallback reshapes the request into two 1-CPU bundles (rather than one
    # 2-CPU bundle), which fits across the two nodes.
    reshaped_fallback = [{"bundles": [{"CPU": 1}, {"CPU": 1}]}]

    pg = placement_group(
        name="reshape_pg",
        bundles=[{"CPU": 2}],  # Infeasible 2-CPU bundle on any node.
        strategy="SPREAD",
        fallback_strategy=reshaped_fallback,
    )

    ray.get(pg.ready(), timeout=10)

    # The table must reflect the fallback's bundle count, not the primary's.
    assert len(placement_group_table(pg)["bundles"]) == 2

    remove_placement_group(pg)
    placement_group_assert_no_leak([pg])
|
|
||
|
|
||
def test_multiple_placement_groups_and_fallbacks(ray_start_cluster):
    """
    Test that multiple placement groups with fallback strategies correctly subtract
    from available resources in the cluster.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=10)
    ray.init(address=cluster.address)

    # Each group's fallback reserves 3 CPUs.
    three_cpu_fallback = [{"bundles": [{"CPU": 3}]}]

    # Create 3 PGs whose primary requests are infeasible, forcing the fallback.
    pgs = [
        placement_group(
            name=f"pg_{i}",
            bundles=[{"CPU": 100}],  # Infeasible
            strategy="PACK",
            fallback_strategy=three_cpu_fallback,
        )
        for i in range(3)
    ]

    # All three should become ready via the fallback strategy.
    for pg in pgs:
        ray.get(pg.ready(), timeout=10)

    # 3 * 3 = 9 CPUs reserved; exactly one CPU should remain schedulable.
    @ray.remote(num_cpus=1)
    def small_task():
        return "ok"

    assert ray.get(small_task.remote(), timeout=5) == "ok"

    # A task needing 2 CPUs exceeds the single remaining CPU, proving the
    # fallback reservations were subtracted from cluster availability.
    @ray.remote(num_cpus=2)
    def large_task():
        return "fail"

    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(large_task.remote(), timeout=2)

    for pg in pgs:
        remove_placement_group(pg)
    placement_group_assert_no_leak(pgs)
|
|
||
|
|
||
def test_placement_group_fallback_validation(ray_start_cluster):
    """
    Verifies that PG validates resource shape with both primary and fallback bundles.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4, num_gpus=0)
    ray.init(address=cluster.address)

    # Primary asks for a GPU (unavailable); the fallback asks for a CPU.
    pg = placement_group(
        name="validation_pg",
        bundles=[{"GPU": 1}],
        strategy="PACK",
        fallback_strategy=[{"bundles": [{"CPU": 1}]}],
    )

    # This task needs CPU, which only the fallback bundles provide; client-side
    # validation must consider the fallback and let the submission through.
    @ray.remote(num_cpus=1)
    def run_on_cpu():
        return "success"

    try:
        # A ValueError here would mean validation ignored the fallback bundles.
        result_ref = run_on_cpu.options(placement_group=pg).remote()
        assert ray.get(result_ref) == "success"
    except ValueError as e:
        pytest.fail(f"Validation failed for fallback-compatible task: {e}")

    # Once ready, bundle_specs should reflect the active (fallback) bundles.
    ray.get(pg.ready())
    first_bundle = pg.bundle_specs[0]
    assert first_bundle.get("CPU") == 1
    assert first_bundle.get("GPU") is None

    remove_placement_group(pg)
    placement_group_assert_no_leak([pg])
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| import sys | ||
|
|
||
| sys.exit(pytest.main(["-v", __file__])) | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.