Skip to content

Commit 4311242

Browse files
authored
[release test] collapse into AnyscaleJobRunner in glue.py (#60267)
it is always an instance of AnyscaleJobRunner. Signed-off-by: Lonnie Liu <95255098+aslonnie@users.noreply.github.com>
1 parent 55b2d08 commit 4311242

File tree

1 file changed

+32
-37
lines changed

1 file changed

+32
-37
lines changed

release/ray_release/glue.py

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from ray_release.cluster_manager.cluster_manager import ClusterManager
1818
from ray_release.cluster_manager.minimal import MinimalClusterManager
1919
from ray_release.command_runner.anyscale_job_runner import AnyscaleJobRunner
20-
from ray_release.command_runner.command_runner import CommandRunner
2120
from ray_release.config import (
2221
DEFAULT_AUTOSUSPEND_MINS,
2322
DEFAULT_BUILD_TIMEOUT,
@@ -80,7 +79,7 @@ def _load_test_configuration(
8079
anyscale_project: str,
8180
result: Result,
8281
smoke_test: bool = False,
83-
) -> Tuple[ClusterManager, CommandRunner, str]:
82+
) -> Tuple[ClusterManager, AnyscaleJobRunner, str]:
8483
logger.info(f"Test config: {test}")
8584

8685
# Populate result paramaters
@@ -136,6 +135,9 @@ def _load_test_configuration(
136135
except Exception as e:
137136
raise ReleaseTestSetupError(f"Error setting up release test: {e}") from e
138137

138+
if not isinstance(command_runner, AnyscaleJobRunner):
139+
raise ReleaseTestSetupError("Command runner is not an AnyscaleJobRunner")
140+
139141
return cluster_manager, command_runner, artifact_path
140142

141143

@@ -221,32 +223,27 @@ def _setup_cluster_environment(
221223
return prepare_cmd, prepare_timeout, build_timeout, cluster_timeout, command_timeout
222224

223225

224-
def _local_environment_information(
225-
result: Result,
226+
def _build_local_environment_information(
226227
cluster_manager: ClusterManager,
227-
command_runner: CommandRunner,
228+
runner: AnyscaleJobRunner,
228229
build_timeout: int,
229230
cluster_timeout: int,
230231
cluster_env_id: Optional[str],
231232
) -> None:
232233
# Start cluster
233234
buildkite_group(":gear: Building cluster environment")
234-
235235
cluster_manager.cluster_env_id = cluster_env_id
236-
237236
cluster_manager.build_configs(timeout=build_timeout)
238-
239-
if isinstance(command_runner, AnyscaleJobRunner):
240-
command_runner.job_manager.cluster_startup_timeout = cluster_timeout
237+
runner.job_manager.cluster_startup_timeout = cluster_timeout
241238

242239

243240
def _prepare_remote_environment(
244241
test: Test,
245-
command_runner: CommandRunner,
242+
runner: AnyscaleJobRunner,
246243
prepare_cmd: bool,
247244
prepare_timeout: int,
248245
) -> None:
249-
command_runner.prepare_remote_env()
246+
runner.prepare_remote_env()
250247

251248
wait_for_nodes = test["run"].get("wait_for_nodes", None)
252249

@@ -257,11 +254,11 @@ def _prepare_remote_environment(
257254
wait_for_nodes.get("timeout", DEFAULT_WAIT_FOR_NODES_TIMEOUT)
258255
)
259256
num_nodes = test["run"]["wait_for_nodes"]["num_nodes"]
260-
command_runner.wait_for_nodes(num_nodes, wait_timeout)
257+
runner.wait_for_nodes(num_nodes, wait_timeout)
261258

262259
if prepare_cmd:
263260
try:
264-
command_runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout)
261+
runner.run_prepare_command(prepare_cmd, timeout=prepare_timeout)
265262
except CommandError as e:
266263
raise PrepareCommandError(e)
267264
except CommandTimeout as e:
@@ -293,7 +290,7 @@ def _upload_working_dir_to_gcs(working_dir: str) -> str:
293290
def _running_test_script(
294291
test: Test,
295292
smoke_test: bool,
296-
command_runner: CommandRunner,
293+
runner: AnyscaleJobRunner,
297294
command_timeout: int,
298295
) -> None:
299296
command = test["run"]["script"]
@@ -306,7 +303,7 @@ def _running_test_script(
306303
is_long_running = test["run"].get("long_running", False)
307304

308305
try:
309-
command_runner.run_command(
306+
runner.run_command(
310307
command,
311308
env=command_env,
312309
timeout=command_timeout,
@@ -329,22 +326,22 @@ def _running_test_script(
329326

330327
def _fetching_results(
331328
result: Result,
332-
command_runner: CommandRunner,
329+
runner: AnyscaleJobRunner,
333330
artifact_path: Optional[str],
334331
smoke_test: bool,
335332
start_time_unix: int,
336333
) -> Tuple[dict, Exception]:
337334
fetch_result_exception = None
338335
try:
339-
command_results = command_runner.fetch_results()
336+
command_results = runner.fetch_results()
340337
except Exception as e:
341338
logger.exception(f"Could not fetch results for test command: {e}")
342339
command_results = {}
343340
fetch_result_exception = e
344341

345342
if artifact_path:
346343
try:
347-
command_runner.fetch_artifact()
344+
runner.fetch_artifact()
348345
except Exception as e:
349346
logger.error("Could not fetch artifact for test command")
350347
logger.exception(e)
@@ -356,7 +353,7 @@ def _fetching_results(
356353
)
357354

358355
try:
359-
metrics = command_runner.fetch_metrics()
356+
metrics = runner.fetch_metrics()
360357
except Exception as e:
361358
logger.exception(f"Could not fetch metrics for test command: {e}")
362359
metrics = {}
@@ -462,15 +459,15 @@ def run_release_test_anyscale(
462459
) -> Result:
463460
old_wd = os.getcwd()
464461
start_time = time.monotonic()
465-
command_runner = None
462+
runner = None
466463
cluster_manager = None
467464
pipeline_exception = None
468465
# non critical for some tests. So separate it from the general one.
469466
fetch_result_exception = None
470467
try:
471468

472469
buildkite_group(":spiral_note_pad: Loading test configuration")
473-
cluster_manager, command_runner, artifact_path = _load_test_configuration(
470+
cluster_manager, runner, artifact_path = _load_test_configuration(
474471
test,
475472
anyscale_project,
476473
result,
@@ -502,11 +499,10 @@ def run_release_test_anyscale(
502499
test_definition_root,
503500
)
504501

505-
buildkite_group(":bulb: Local environment information")
506-
_local_environment_information(
507-
result,
502+
buildkite_group(":bulb: Building local environment information")
503+
_build_local_environment_information(
508504
cluster_manager,
509-
command_runner,
505+
runner,
510506
build_timeout,
511507
cluster_timeout,
512508
cluster_env_id,
@@ -516,7 +512,7 @@ def run_release_test_anyscale(
516512
buildkite_group(":wrench: Preparing remote environment")
517513
_prepare_remote_environment(
518514
test,
519-
command_runner,
515+
runner,
520516
prepare_cmd,
521517
prepare_timeout,
522518
)
@@ -526,32 +522,31 @@ def run_release_test_anyscale(
526522
_running_test_script(
527523
test,
528524
smoke_test,
529-
command_runner,
525+
runner,
530526
command_timeout,
531527
)
532528

533529
buildkite_group(":floppy_disk: Fetching results")
534530
metrics, fetch_result_exception = _fetching_results(
535531
result,
536-
command_runner,
532+
runner,
537533
artifact_path,
538534
smoke_test,
539535
start_time_unix,
540536
)
537+
538+
# Obtain the cluster info again as it is set after the
539+
# command was run in case of anyscale jobs
540+
result.job_url = runner.job_manager.job_url
541+
result.job_id = runner.job_manager.job_id
542+
result.last_logs = runner.get_last_logs()
543+
541544
except Exception as e:
542545
logger.exception(e)
543546
buildkite_open_last()
544547
pipeline_exception = e
545548
metrics = {}
546549

547-
# Obtain the cluster info again as it is set after the
548-
# command was run in case of anyscale jobs
549-
if isinstance(command_runner, AnyscaleJobRunner):
550-
result.job_url = command_runner.job_manager.job_url
551-
result.job_id = command_runner.job_manager.job_id
552-
553-
result.last_logs = command_runner.get_last_logs() if command_runner else None
554-
555550
reset_signal_handling()
556551

557552
time_taken = time.monotonic() - start_time

0 commit comments

Comments
 (0)