|
11 | 11 | wait_for_condition, |
12 | 12 | wait_for_pid_to_exit, |
13 | 13 | SignalActor, |
| 14 | + Semaphore, |
14 | 15 | ) |
| 16 | +from ray.internal.internal_api import memory_summary |
15 | 17 |
|
16 | 18 | SIGKILL = signal.SIGKILL if sys.platform != "win32" else signal.SIGTERM |
17 | 19 |
|
# Task status strings as they appear in the `memory_summary` output.
# Tests below grep summary lines for these states to track a task's
# lifecycle (blocked on inputs -> scheduled -> finished).
WAITING_FOR_DEPENDENCIES = "WAITING_FOR_DEPENDENCIES"
SCHEDULED = "SCHEDULED"
FINISHED = "FINISHED"

18 | 25 |
|
19 | 26 | def test_cached_object(ray_start_cluster): |
20 | 27 | config = { |
@@ -1014,6 +1021,85 @@ def dependent_task(x): |
1014 | 1021 | ray.get(obj, timeout=60) |
1015 | 1022 |
|
1016 | 1023 |
|
def test_memory_util(ray_start_cluster):
    """Check that `memory_summary` reports re-executed task attempts.

    Builds a three-task lineage on one node, kills that node so the stored
    objects are lost, and then verifies that the reconstruction attempts
    show up in the cluster memory summary as "Attempt #2" lines with the
    expected WAITING_FOR_DEPENDENCIES / SCHEDULED / FINISHED states.
    """
    # Shortened heartbeat/object timeouts so node failure is detected
    # quickly instead of waiting on the default intervals.
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        resources={"head": 1},
        _system_config=config,
        enable_object_reconstruction=True,
    )
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10 ** 8
    )
    cluster.wait_for_nodes()

    @ray.remote
    def large_object(sema=None):
        # Optionally block on the semaphore so the driver controls exactly
        # when each task attempt is allowed to finish.
        if sema is not None:
            ray.get(sema.acquire.remote())
        return np.zeros(10 ** 7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x, sema):
        # Gate completion on the semaphore, same as large_object above.
        ray.get(sema.acquire.remote())
        return x

    def stats():
        # Parse the cluster memory summary and count second-attempt task
        # lines in each lifecycle state. Returns a (waiting, scheduled,
        # finished) tuple of counts.
        info = memory_summary(cluster.address, line_wrap=False)
        info = info.split("\n")
        reconstructing_waiting = [
            line
            for line in info
            if "Attempt #2" in line and WAITING_FOR_DEPENDENCIES in line
        ]
        reconstructing_scheduled = [
            line for line in info if "Attempt #2" in line and SCHEDULED in line
        ]
        reconstructing_finished = [
            line for line in info if "Attempt #2" in line and FINISHED in line
        ]
        return (
            len(reconstructing_waiting),
            len(reconstructing_scheduled),
            len(reconstructing_finished),
        )

    # Phase 1: build the lineage obj -> x -> ref on node1 and let all three
    # tasks run to completion (one semaphore release per gated task).
    sema = Semaphore.options(resources={"head": 1}).remote(value=0)
    obj = large_object.options(resources={"node1": 1}).remote(sema)
    x = dependent_task.options(resources={"node1": 1}).remote(obj, sema)
    ref = dependent_task.options(resources={"node1": 1}).remote(x, sema)
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(ref)
    # Nothing has been re-executed yet, so no "Attempt #2" lines.
    wait_for_condition(lambda: stats() == (0, 0, 0))
    del ref

    # Phase 2: lose the objects stored on node1, then bring up a
    # replacement node with the same resources so re-execution has
    # somewhere to be scheduled.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10 ** 8
    )

    # Requesting x's lineage again triggers second attempts: the summary
    # should show one task blocked on its (lost) dependency and one
    # scheduled to run.
    ref = dependent_task.remote(x, sema)
    wait_for_condition(lambda: stats() == (1, 1, 0))
    # Release the gated tasks one step at a time and watch the attempts
    # advance through the states...
    ray.get(sema.release.remote())
    wait_for_condition(lambda: stats() == (0, 1, 1))
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(ref)
    # ...until both second attempts report FINISHED.
    wait_for_condition(lambda: stats() == (0, 0, 2))
| 1102 | + |
1017 | 1103 | if __name__ == "__main__": |
1018 | 1104 | import pytest |
1019 | 1105 |
|
|
0 commit comments