Skip to content

Commit 9ba8f72

Browse files
AmeerHajAliedoakes
authored andcommitted
[autoscaler] Add the cluster_name to docker file mounts directory prefix to make it more unique (#11600)
1 parent bcc92f5 commit 9ba8f72

File tree

6 files changed

+56
-30
lines changed

6 files changed

+56
-30
lines changed

python/ray/autoscaler/_private/command_runner.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
check_docker_running_cmd, \
1717
check_docker_image, \
1818
docker_start_cmds, \
19-
DOCKER_MOUNT_PREFIX, \
2019
with_docker_exec
2120
from ray.autoscaler._private.log_timer import LogTimer
2221

@@ -292,6 +291,7 @@ def __init__(self, log_prefix, node_id, provider, auth_config,
292291
ssh_user_hash[:HASH_MAX_LENGTH],
293292
ssh_control_hash[:HASH_MAX_LENGTH])
294293

294+
self.cluster_name = cluster_name
295295
self.log_prefix = log_prefix
296296
self.process_runner = process_runner
297297
self.node_id = node_id
@@ -597,8 +597,9 @@ def run(
597597

598598
def run_rsync_up(self, source, target, options=None):
599599
options = options or {}
600-
host_destination = os.path.join(DOCKER_MOUNT_PREFIX,
601-
target.lstrip("/"))
600+
host_destination = os.path.join(
601+
self._get_docker_host_mount_location(
602+
self.ssh_command_runner.cluster_name), target.lstrip("/"))
602603

603604
self.ssh_command_runner.run(
604605
f"mkdir -p {os.path.dirname(host_destination.rstrip('/'))}")
@@ -617,7 +618,9 @@ def run_rsync_up(self, source, target, options=None):
617618

618619
def run_rsync_down(self, source, target, options=None):
619620
options = options or {}
620-
host_source = os.path.join(DOCKER_MOUNT_PREFIX, source.lstrip("/"))
621+
host_source = os.path.join(
622+
self._get_docker_host_mount_location(
623+
self.ssh_command_runner.cluster_name), source.lstrip("/"))
621624
self.ssh_command_runner.run(
622625
f"mkdir -p {os.path.dirname(host_source.rstrip('/'))}")
623626
if source[-1] == "/":
@@ -709,7 +712,8 @@ def run_init(self, *, as_head, file_mounts):
709712
self.docker_config.get(
710713
"run_options", []) + self.docker_config.get(
711714
f"{'head' if as_head else 'worker'}_run_options",
712-
[]) + self._configure_runtime())
715+
[]) + self._configure_runtime(),
716+
self.ssh_command_runner.cluster_name)
713717
self.run(start_command, run_env="host")
714718
else:
715719
running_image = self.run(
@@ -746,7 +750,9 @@ def run_init(self, *, as_head, file_mounts):
746750
if mount in file_mounts:
747751
self.ssh_command_runner.run(
748752
"docker cp {src} {container}:{dst}".format(
749-
src=os.path.join(DOCKER_MOUNT_PREFIX, mount),
753+
src=os.path.join(
754+
self._get_docker_host_mount_location(
755+
self.ssh_command_runner.cluster_name), mount),
750756
container=self.container_name,
751757
dst=self._docker_expand_user(mount)))
752758
self.initialized = True
@@ -769,3 +775,9 @@ def _configure_runtime(self):
769775
return []
770776

771777
return []
778+
779+
def _get_docker_host_mount_location(self, cluster_name: str) -> str:
780+
"""Return the docker host mount directory location."""
781+
# Imported here due to circular dependency in imports.
782+
from ray.autoscaler.sdk import get_docker_host_mount_location
783+
return get_docker_host_mount_location(cluster_name)

python/ray/autoscaler/_private/constants.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
22

3-
from ray.ray_constants import ( # noqa F401
3+
from ray.ray_constants import ( # noqa F401
44
AUTOSCALER_RESOURCE_REQUEST_CHANNEL, LOGGER_FORMAT,
55
MEMORY_RESOURCE_UNIT_BYTES, RESOURCES_ENVIRONMENT_VARIABLE)
66

@@ -36,6 +36,3 @@ def env_integer(key, default):
3636
BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
3737
# Max number of retries to create an EC2 node (retry different subnet)
3838
BOTO_CREATE_MAX_RETRIES = env_integer("BOTO_CREATE_MAX_RETRIES", 5)
39-
40-
# Host path that Docker mounts attach to
41-
DOCKER_MOUNT_PREFIX = "/tmp/ray_tmp_mount"

python/ray/autoscaler/_private/docker.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
except ImportError: # py2
55
from pipes import quote
66

7-
from ray.autoscaler._private.constants import DOCKER_MOUNT_PREFIX
8-
97
logger = logging.getLogger(__name__)
108

119

@@ -66,8 +64,12 @@ def check_docker_image(cname):
6664
return _check_helper(cname, ".Config.Image")
6765

6866

69-
def docker_start_cmds(user, image, mount_dict, cname, user_options):
70-
mount = {f"{DOCKER_MOUNT_PREFIX}/{dst}": dst for dst in mount_dict}
67+
def docker_start_cmds(user, image, mount_dict, container_name, user_options,
68+
cluster_name):
69+
# Imported here due to circular dependency.
70+
from ray.autoscaler.sdk import get_docker_host_mount_location
71+
docker_mount_prefix = get_docker_host_mount_location(cluster_name)
72+
mount = {f"{docker_mount_prefix}/{dst}": dst for dst in mount_dict}
7173

7274
# TODO(ilr) Move away from defaulting to /root/
7375
mount_flags = " ".join([
@@ -82,7 +84,8 @@ def docker_start_cmds(user, image, mount_dict, cname, user_options):
8284

8385
user_options_str = " ".join(user_options)
8486
docker_run = [
85-
"docker", "run", "--rm", "--name {}".format(cname), "-d", "-it",
86-
mount_flags, env_flags, user_options_str, "--net=host", image, "bash"
87+
"docker", "run", "--rm", "--name {}".format(container_name), "-d",
88+
"-it", mount_flags, env_flags, user_options_str, "--net=host", image,
89+
"bash"
8790
]
8891
return " ".join(docker_run)

python/ray/autoscaler/sdk.py

+6
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,9 @@ def fillout_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
208208
"""Fillout default values for a cluster_config based on the provider."""
209209
from ray.autoscaler._private.util import fillout_defaults
210210
return fillout_defaults(config)
211+
212+
213+
def get_docker_host_mount_location(cluster_name: str) -> str:
214+
"""Return host path that Docker mounts attach to."""
215+
docker_mount_prefix = "/tmp/ray_tmp_mount/{cluster_name}"
216+
return docker_mount_prefix.format(cluster_name=cluster_name)

python/ray/tests/test_autoscaler.py

+18-11
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import ray._private.services as services
1616
from ray.autoscaler._private.util import prepare_config, validate_config
1717
from ray.autoscaler._private import commands
18-
from ray.autoscaler._private.docker import DOCKER_MOUNT_PREFIX
18+
from ray.autoscaler.sdk import get_docker_host_mount_location
1919
from ray.autoscaler._private.load_metrics import LoadMetrics
2020
from ray.autoscaler._private.autoscaler import StandardAutoscaler
2121
from ray.autoscaler._private.providers import (_NODE_PROVIDERS,
@@ -502,14 +502,18 @@ def testGetOrCreateHeadNode(self):
502502
runner.assert_has_call("1.2.3.4", "start_ray_head")
503503
self.assertEqual(self.provider.mock_nodes[0].node_type, None)
504504
runner.assert_has_call("1.2.3.4", pattern="docker run")
505+
506+
docker_mount_prefix = get_docker_host_mount_location(
507+
SMALL_CLUSTER["cluster_name"])
505508
runner.assert_not_has_call(
506-
"1.2.3.4", pattern="-v /tmp/ray_tmp_mount/~/ray_bootstrap_config")
507-
runner.assert_has_call(
508509
"1.2.3.4",
509-
pattern="docker cp /tmp/ray_tmp_mount/~/ray_bootstrap_key.pem")
510+
pattern=f"-v {docker_mount_prefix}/~/ray_bootstrap_config")
510511
runner.assert_has_call(
511512
"1.2.3.4",
512-
pattern="docker cp /tmp/ray_tmp_mount/~/ray_bootstrap_config.yaml")
513+
pattern=f"docker cp {docker_mount_prefix}/~/ray_bootstrap_key.pem")
514+
pattern_to_assert = \
515+
f"docker cp {docker_mount_prefix}/~/ray_bootstrap_config.yaml"
516+
runner.assert_has_call("1.2.3.4", pattern=pattern_to_assert)
513517

514518
@unittest.skipIf(sys.platform == "win32", "Failing on Windows.")
515519
def testRsyncCommandWithDocker(self):
@@ -1473,12 +1477,13 @@ def testContinuousFileMounts(self):
14731477
self.waitForNodes(
14741478
2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
14751479
autoscaler.update()
1476-
1480+
docker_mount_prefix = get_docker_host_mount_location(
1481+
config["cluster_name"])
14771482
for i in [0, 1]:
14781483
runner.assert_has_call(f"172.0.0.{i}", "setup_cmd")
14791484
runner.assert_has_call(
14801485
f"172.0.0.{i}", f"{file_mount_dir}/ [email protected].{i}:"
1481-
f"{DOCKER_MOUNT_PREFIX}/home/test-folder/")
1486+
f"{docker_mount_prefix}/home/test-folder/")
14821487

14831488
runner.clear_history()
14841489

@@ -1498,7 +1503,7 @@ def testContinuousFileMounts(self):
14981503
runner.assert_has_call(
14991504
f"172.0.0.{i}", f"172.0.0.{i}",
15001505
f"{file_mount_dir}/ [email protected].{i}:"
1501-
f"{DOCKER_MOUNT_PREFIX}/home/test-folder/")
1506+
f"{docker_mount_prefix}/home/test-folder/")
15021507

15031508
def testFileMountsNonContinuous(self):
15041509
file_mount_dir = tempfile.mkdtemp()
@@ -1525,13 +1530,15 @@ def testFileMountsNonContinuous(self):
15251530
self.waitForNodes(
15261531
2, tag_filters={TAG_RAY_NODE_STATUS: STATUS_UP_TO_DATE})
15271532
autoscaler.update()
1533+
docker_mount_prefix = get_docker_host_mount_location(
1534+
config["cluster_name"])
15281535

15291536
for i in [0, 1]:
15301537
runner.assert_has_call(f"172.0.0.{i}", "setup_cmd")
15311538
runner.assert_has_call(
15321539
f"172.0.0.{i}", f"172.0.0.{i}",
15331540
f"{file_mount_dir}/ [email protected].{i}:"
1534-
f"{DOCKER_MOUNT_PREFIX}/home/test-folder/")
1541+
f"{docker_mount_prefix}/home/test-folder/")
15351542

15361543
runner.clear_history()
15371544

@@ -1548,7 +1555,7 @@ def testFileMountsNonContinuous(self):
15481555
runner.assert_not_has_call(f"172.0.0.{i}", "setup_cmd")
15491556
runner.assert_not_has_call(
15501557
f"172.0.0.{i}", f"{file_mount_dir}/ [email protected].{i}:"
1551-
f"{DOCKER_MOUNT_PREFIX}/home/test-folder/")
1558+
f"{docker_mount_prefix}/home/test-folder/")
15521559

15531560
# Simulate a second `ray up` call
15541561
from ray.autoscaler._private import util
@@ -1574,7 +1581,7 @@ def testFileMountsNonContinuous(self):
15741581
runner.assert_has_call(
15751582
f"172.0.0.{i}", f"172.0.0.{i}",
15761583
f"{file_mount_dir}/ [email protected].{i}:"
1577-
f"{DOCKER_MOUNT_PREFIX}/home/test-folder/")
1584+
f"{docker_mount_prefix}/home/test-folder/")
15781585

15791586

15801587
if __name__ == "__main__":

python/ray/tests/test_command_runner.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from ray.autoscaler.command_runner import CommandRunnerInterface
88
from ray.autoscaler._private.command_runner import SSHCommandRunner, \
99
DockerCommandRunner, KubernetesCommandRunner, _with_environment_variables
10-
from ray.autoscaler._private.docker import DOCKER_MOUNT_PREFIX
10+
from ray.autoscaler.sdk import get_docker_host_mount_location
1111
from getpass import getuser
1212
import hashlib
1313

@@ -242,11 +242,12 @@ def test_docker_rsync():
242242

243243
local_mount = "/home/ubuntu/base/mount/"
244244
remote_mount = "/root/protected_mount/"
245-
remote_host_mount = f"{DOCKER_MOUNT_PREFIX}{remote_mount}"
245+
docker_mount_prefix = get_docker_host_mount_location(cluster_name)
246+
remote_host_mount = f"{docker_mount_prefix}{remote_mount}"
246247

247248
local_file = "/home/ubuntu/base-file"
248249
remote_file = "/root/protected-file"
249-
remote_host_file = f"{DOCKER_MOUNT_PREFIX}{remote_file}"
250+
remote_host_file = f"{docker_mount_prefix}{remote_file}"
250251

251252
process_runner.respond_to_call("docker inspect -f", ["true"])
252253
cmd_runner.run_rsync_up(

0 commit comments

Comments
 (0)