Skip to content

Commit 5e6b39c

Browse files
andylizfMichaelvll
andauthored
Change added USER_HASH in models name to full digits (8) instead of half (4) to prevent potential hash conflict (#4592)
* test: print * feat: use 8 digits for user hash * test: full user hash * test: the content is shorten due to length limit * feat: convert user hash from hex to base36 * chore: extend support lifetime of relied code * fix: after merge, re-delete extra user hash splition in smoke test * style: format and type hints * Revert "feat: convert user hash from hex to base36" This reverts commit 2f7ca14. * Update tests/unit_tests/test_common_utils.py --------- Co-authored-by: Zhanghao Wu <[email protected]>
1 parent abc81ab commit 5e6b39c

File tree

5 files changed

+39
-38
lines changed

5 files changed

+39
-38
lines changed

sky/backends/backend_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -997,7 +997,7 @@ def write_cluster_config(
997997

998998
# Read the cluster name from the tmp yaml file, to take the backward
999999
# compatbility restortion above into account.
1000-
# TODO: remove this after 2 minor releases, 0.8.0.
1000+
# TODO: remove this after 2 minor releases, 0.10.0.
10011001
yaml_config = common_utils.read_yaml(tmp_yaml_path)
10021002
config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
10031003

sky/utils/common_utils.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828

2929
_USER_HASH_FILE = os.path.expanduser('~/.sky/user_hash')
3030
USER_HASH_LENGTH = 8
31-
USER_HASH_LENGTH_IN_CLUSTER_NAME = 4
3231

3332
# We are using base36 to reduce the length of the hash. 2 chars -> 36^2 = 1296
3433
# possibilities. considering the final cluster name contains the prefix as well,
@@ -182,7 +181,7 @@ def make_cluster_name_on_cloud(display_name: str,
182181
f'on the cloud, we convert it to {cluster_name_on_cloud}.')
183182
user_hash = ''
184183
if add_user_hash:
185-
user_hash = get_user_hash()[:USER_HASH_LENGTH_IN_CLUSTER_NAME]
184+
user_hash = get_user_hash()
186185
user_hash = f'-{user_hash}'
187186
user_hash_length = len(user_hash)
188187

tests/smoke_tests/smoke_tests_utils.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import subprocess
55
import sys
66
import tempfile
7-
from typing import Dict, List, NamedTuple, Optional, Tuple
7+
from typing import Dict, List, NamedTuple, Optional, Sequence
88
import uuid
99

1010
import colorama
@@ -32,7 +32,7 @@
3232

3333
STORAGE_SETUP_COMMANDS = [
3434
'touch ~/tmpfile', 'mkdir -p ~/tmp-workdir',
35-
'touch ~/tmp-workdir/tmp\ file', 'touch ~/tmp-workdir/tmp\ file2',
35+
r'touch ~/tmp-workdir/tmp\ file', r'touch ~/tmp-workdir/tmp\ file2',
3636
'touch ~/tmp-workdir/foo',
3737
'[ ! -e ~/tmp-workdir/circle-link ] && ln -s ~/tmp-workdir/ ~/tmp-workdir/circle-link || true',
3838
'touch ~/.ssh/id_rsa.pub'
@@ -55,7 +55,7 @@
5555
[status.value for status in sky.ManagedJobStatus])
5656

5757

58-
def _statuses_to_str(statuses: List[enum.Enum]):
58+
def _statuses_to_str(statuses: Sequence[enum.Enum]):
5959
"""Convert a list of enums to a string with all the values separated by |."""
6060
assert len(statuses) > 0, 'statuses must not be empty'
6161
if len(statuses) > 1:
@@ -74,8 +74,8 @@ def _statuses_to_str(statuses: List[enum.Enum]):
7474
'fi; '
7575
'current_status=$(sky status {cluster_name} --refresh | '
7676
'awk "/^{cluster_name}/ '
77-
'{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES +
78-
')$/) print \$i}}"); '
77+
r'{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_CLUSTER_STATUSES +
78+
r')$/) print \$i}}"); '
7979
'if [[ "$current_status" =~ {cluster_status} ]]; '
8080
'then echo "Target cluster status {cluster_status} reached."; break; fi; '
8181
'echo "Waiting for cluster status to become {cluster_status}, current status: $current_status"; '
@@ -136,8 +136,8 @@ def get_cmd_wait_until_cluster_is_not_found(cluster_name: str, timeout: int):
136136
'fi; '
137137
'current_status=$(sky queue {cluster_name} | '
138138
'awk "\\$1 == \\"{job_id}\\" '
139-
'{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES +
140-
')$/) print \$i}}"); '
139+
r'{{for (i=1; i<=NF; i++) if (\$i ~ /^(' + _ALL_JOB_STATUSES +
140+
r')$/) print \$i}}"); '
141141
'found=0; ' # Initialize found variable outside the loop
142142
'while read -r line; do ' # Read line by line
143143
' if [[ "$line" =~ {job_status} ]]; then ' # Check each line
@@ -196,7 +196,8 @@ def get_cmd_wait_until_job_status_contains_matching_job_name(
196196

197197

198198
def get_cmd_wait_until_managed_job_status_contains_matching_job_name(
199-
job_name: str, job_status: List[sky.JobStatus], timeout: int):
199+
job_name: str, job_status: Sequence[sky.ManagedJobStatus],
200+
timeout: int):
200201
return _WAIT_UNTIL_MANAGED_JOB_STATUS_CONTAINS_MATCHING_JOB_NAME.format(
201202
job_name=job_name,
202203
job_status=_statuses_to_str(job_status),
@@ -215,7 +216,7 @@ class Test(NamedTuple):
215216
# Timeout for each command in seconds.
216217
timeout: int = DEFAULT_CMD_TIMEOUT
217218
# Environment variables to set for each command.
218-
env: Dict[str, str] = None
219+
env: Optional[Dict[str, str]] = None
219220

220221
def echo(self, message: str):
221222
# pytest's xdist plugin captures stdout; print to stderr so that the
@@ -254,7 +255,7 @@ def terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str:
254255
f' --quiet $({query_cmd})')
255256

256257

257-
def run_one_test(test: Test) -> Tuple[int, str, str]:
258+
def run_one_test(test: Test) -> None:
258259
# Fail fast if `sky` CLI somehow errors out.
259260
subprocess.run(['sky', 'status'], stdout=subprocess.DEVNULL, check=True)
260261
log_to_stdout = os.environ.get('LOG_TO_STDOUT', None)

tests/smoke_tests/test_managed_job.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def test_managed_jobs_basic(generic_cloud: str):
7777
# Test the functionality for logging.
7878
f's=$(sky jobs logs -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "start counting"',
7979
f's=$(sky jobs logs --controller -n {name}-2 --no-follow); echo "$s"; echo "$s" | grep "Cluster launched:"',
80-
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"',
80+
rf'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-2 | head -n1 | grep "RUNNING\|SUCCEEDED"',
8181
],
8282
# TODO(zhwu): Change to f'sky jobs cancel -y -n {name}-1 -n {name}-2' when
8383
# canceling multiple job names is supported.
@@ -105,19 +105,19 @@ def test_job_pipeline(generic_cloud: str):
105105
[
106106
f'sky jobs launch -n {name} tests/test_yamls/pipeline.yaml --cloud {generic_cloud} -y -d',
107107
'sleep 5',
108-
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"',
108+
rf'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "STARTING\|RUNNING"',
109109
# `grep -A 4 {name}` finds the job with {name} and the 4 lines
110110
# after it, i.e. the 4 tasks within the job.
111111
# `sed -n 2p` gets the second line of the 4 lines, i.e. the first
112112
# task within the job.
113-
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"',
113+
rf'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "STARTING\|RUNNING"',
114114
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "PENDING"',
115115
f'sky jobs cancel -y -n {name}',
116116
'sleep 5',
117-
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"',
118-
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"',
119-
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"',
120-
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"',
117+
rf'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLING\|CANCELLED"',
118+
rf'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLING\|CANCELLED"',
119+
rf'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 4p | grep "CANCELLING\|CANCELLED"',
120+
rf'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 5p | grep "CANCELLING\|CANCELLED"',
121121
'sleep 200',
122122
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 2p | grep "CANCELLED"',
123123
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep -A 4 {name}| sed -n 3p | grep "CANCELLED"',
@@ -212,7 +212,7 @@ def test_managed_jobs_recovery_aws(aws_config_region):
212212
test = smoke_tests_utils.Test(
213213
'managed_jobs_recovery_aws',
214214
[
215-
f'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
215+
rf'sky jobs launch --cloud aws --region {region} --use-spot -n {name} "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
216216
smoke_tests_utils.
217217
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
218218
job_name=name,
@@ -258,7 +258,7 @@ def test_managed_jobs_recovery_gcp():
258258
test = smoke_tests_utils.Test(
259259
'managed_jobs_recovery_gcp',
260260
[
261-
f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
261+
rf'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --cpus 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
262262
smoke_tests_utils.
263263
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
264264
job_name=name,
@@ -288,14 +288,13 @@ def test_managed_jobs_pipeline_recovery_aws(aws_config_region):
288288
"""Test managed job recovery for a pipeline."""
289289
name = smoke_tests_utils.get_cluster_name()
290290
user_hash = common_utils.get_user_hash()
291-
user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME]
292291
region = aws_config_region
293292
if region != 'us-east-2':
294293
pytest.skip('Only run spot pipeline recovery test in us-east-2')
295294
test = smoke_tests_utils.Test(
296295
'managed_jobs_pipeline_recovery_aws',
297296
[
298-
f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d',
297+
f'sky jobs launch -n {name} tests/test_yamls/pipeline_aws.yaml -y -d',
299298
smoke_tests_utils.
300299
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
301300
job_name=name,
@@ -342,7 +341,6 @@ def test_managed_jobs_pipeline_recovery_gcp():
342341
name = smoke_tests_utils.get_cluster_name()
343342
zone = 'us-east4-b'
344343
user_hash = common_utils.get_user_hash()
345-
user_hash = user_hash[:common_utils.USER_HASH_LENGTH_IN_CLUSTER_NAME]
346344
query_cmd = (
347345
'gcloud compute instances list --filter='
348346
f'"(labels.ray-cluster-name:*-${{MANAGED_JOB_ID}}-{user_hash})" '
@@ -352,7 +350,7 @@ def test_managed_jobs_pipeline_recovery_gcp():
352350
test = smoke_tests_utils.Test(
353351
'managed_jobs_pipeline_recovery_gcp',
354352
[
355-
f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d',
353+
f'sky jobs launch -n {name} tests/test_yamls/pipeline_gcp.yaml -y -d',
356354
smoke_tests_utils.
357355
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
358356
job_name=name,
@@ -426,7 +424,7 @@ def test_managed_jobs_recovery_multi_node_aws(aws_config_region):
426424
test = smoke_tests_utils.Test(
427425
'managed_jobs_recovery_multi_node_aws',
428426
[
429-
f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
427+
rf'sky jobs launch --cloud aws --region {region} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
430428
smoke_tests_utils.
431429
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
432430
job_name=name,
@@ -473,7 +471,7 @@ def test_managed_jobs_recovery_multi_node_gcp():
473471
test = smoke_tests_utils.Test(
474472
'managed_jobs_recovery_multi_node_gcp',
475473
[
476-
f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
474+
rf'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot --num-nodes 2 "echo SKYPILOT_TASK_ID: \$SKYPILOT_TASK_ID; sleep 1800" -y -d',
477475
smoke_tests_utils.
478476
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
479477
job_name=name,
@@ -512,7 +510,7 @@ def test_managed_jobs_cancellation_aws(aws_config_region):
512510
'managed_jobs_cancellation_aws',
513511
[
514512
# Test cancellation during spot cluster being launched.
515-
f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d',
513+
f'sky jobs launch --cloud aws --region {region} -n {name} --use-spot "sleep 1000" -y -d',
516514
smoke_tests_utils.
517515
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
518516
job_name=name,
@@ -532,7 +530,7 @@ def test_managed_jobs_cancellation_aws(aws_config_region):
532530
'--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]'
533531
),
534532
# Test cancelling the spot cluster during spot job being setup.
535-
f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d',
533+
f'sky jobs launch --cloud aws --region {region} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d',
536534
# The job is set up in the cluster, will shown as RUNNING.
537535
smoke_tests_utils.
538536
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
@@ -551,7 +549,7 @@ def test_managed_jobs_cancellation_aws(aws_config_region):
551549
'--output text) && echo "$s" && echo; [[ -z "$s" ]] || [[ "$s" = "terminated" ]] || [[ "$s" = "shutting-down" ]]'
552550
),
553551
# Test cancellation during spot job is recovering.
554-
f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d',
552+
f'sky jobs launch --cloud aws --region {region} -n {name}-3 --use-spot "sleep 1000" -y -d',
555553
# The job is running in the cluster, will shown as RUNNING.
556554
smoke_tests_utils.
557555
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
@@ -605,7 +603,7 @@ def test_managed_jobs_cancellation_gcp():
605603
'managed_jobs_cancellation_gcp',
606604
[
607605
# Test cancellation during spot cluster being launched.
608-
f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d',
606+
f'sky jobs launch --cloud gcp --zone {zone} -n {name} --use-spot "sleep 1000" -y -d',
609607
smoke_tests_utils.
610608
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
611609
job_name=name,
@@ -618,7 +616,7 @@ def test_managed_jobs_cancellation_gcp():
618616
job_status=[sky.ManagedJobStatus.CANCELLED],
619617
timeout=155),
620618
# Test cancelling the spot cluster during spot job being setup.
621-
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d',
619+
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-2 --use-spot tests/test_yamls/test_long_setup.yaml -y -d',
622620
# The job is set up in the cluster, will shown as RUNNING.
623621
smoke_tests_utils.
624622
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
@@ -632,7 +630,7 @@ def test_managed_jobs_cancellation_gcp():
632630
job_status=[sky.ManagedJobStatus.CANCELLED],
633631
timeout=155),
634632
# Test cancellation during spot job is recovering.
635-
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d',
633+
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d',
636634
smoke_tests_utils.
637635
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
638636
job_name=f'{name}-3',
@@ -885,7 +883,7 @@ def test_managed_jobs_inline_env(generic_cloud: str):
885883
test = smoke_tests_utils.Test(
886884
'test-managed-jobs-inline-env',
887885
[
888-
f'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "echo "\\$TEST_ENV"; ([[ ! -z \\"\$TEST_ENV\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_IPS}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NODE_RANK}\\" ]] && [[ ! -z \\"\${constants.SKYPILOT_NUM_NODES}\\" ]]) || exit 1"',
886+
rf'sky jobs launch -n {name} -y --cloud {generic_cloud} --env TEST_ENV="hello world" -- "echo "\$TEST_ENV"; ([[ ! -z \"\$TEST_ENV\" ]] && [[ ! -z \"\${constants.SKYPILOT_NODE_IPS}\" ]] && [[ ! -z \"\${constants.SKYPILOT_NODE_RANK}\" ]] && [[ ! -z \"\${constants.SKYPILOT_NUM_NODES}\" ]]) || exit 1"',
889887
smoke_tests_utils.
890888
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
891889
job_name=name,

tests/unit_tests/test_common_utils.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,19 @@ class TestMakeClusterNameOnCloud:
3636
@mock.patch('sky.utils.common_utils.get_user_hash')
3737
def test_make(self, mock_get_user_hash):
3838
mock_get_user_hash.return_value = MOCKED_USER_HASH
39-
assert "lora-ab12" == common_utils.make_cluster_name_on_cloud("lora")
39+
assert "lora-ab12cd34" == common_utils.make_cluster_name_on_cloud(
40+
"lora")
4041

4142
@mock.patch('sky.utils.common_utils.get_user_hash')
4243
def test_make_with_hyphen(self, mock_get_user_hash):
4344
mock_get_user_hash.return_value = MOCKED_USER_HASH
44-
assert "seed-1-ab12" == common_utils.make_cluster_name_on_cloud(
45+
assert "seed-1-ab12cd34" == common_utils.make_cluster_name_on_cloud(
4546
"seed-1")
4647

4748
@mock.patch('sky.utils.common_utils.get_user_hash')
4849
def test_make_with_characters_to_transform(self, mock_get_user_hash):
4950
mock_get_user_hash.return_value = MOCKED_USER_HASH
50-
assert "cuda-11-8-ab12" == common_utils.make_cluster_name_on_cloud(
51+
assert "cud-73-ab12cd34" == common_utils.make_cluster_name_on_cloud(
5152
"Cuda_11.8")
53+
assert "cuda-11-8-ab12cd34" == common_utils.make_cluster_name_on_cloud(
54+
"Cuda_11.8", max_length=20)

0 commit comments

Comments
 (0)