Skip to content

Commit 81b0b1d

Browse files
khluu and Copilot authored
[release][ci] First test for kuberay release test trigger path (ray-project#54415)
- Modify the KubeRay release test trigger code to catch exceptions and store them in `Result`
- Modify the glue unit test to include the KubeRay variant of the release test trigger path

---------

Signed-off-by: kevin <kevin@anyscale.com>
Signed-off-by: Kevin H. Luu <kevin@anyscale.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 915a275 commit 81b0b1d

File tree

2 files changed

+108
-45
lines changed

2 files changed

+108
-45
lines changed

release/ray_release/glue.py

Lines changed: 54 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -421,42 +421,61 @@ def run_release_test_kuberay(
421421
smoke_test: bool = False,
422422
test_definition_root: Optional[str] = None,
423423
) -> Result:
424-
result.stable = test.get("stable", True)
425-
result.smoke_test = smoke_test
426-
cluster_compute = load_test_cluster_compute(test, test_definition_root)
427-
kuberay_compute_config = convert_cluster_compute_to_kuberay_compute_config(
428-
cluster_compute
429-
)
430-
kuberay_autoscaler_version = cluster_compute.get("autoscaler_version", None)
431-
if kuberay_autoscaler_version:
432-
kuberay_autoscaler_config = {"version": kuberay_autoscaler_version}
433-
else:
434-
kuberay_autoscaler_config = None
435-
working_dir_upload_path = upload_working_dir(get_working_dir(test))
424+
start_time = time.monotonic()
425+
pipeline_exception = None
426+
try:
427+
result.stable = test.get("stable", True)
428+
result.smoke_test = smoke_test
429+
cluster_compute = load_test_cluster_compute(test, test_definition_root)
430+
kuberay_compute_config = convert_cluster_compute_to_kuberay_compute_config(
431+
cluster_compute
432+
)
433+
kuberay_autoscaler_version = cluster_compute.get("autoscaler_version", None)
434+
if kuberay_autoscaler_version:
435+
kuberay_autoscaler_config = {"version": kuberay_autoscaler_version}
436+
else:
437+
kuberay_autoscaler_config = None
438+
working_dir_upload_path = upload_working_dir(get_working_dir(test))
439+
440+
command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT))
441+
test_name_hash = hashlib.sha256(test["name"].encode()).hexdigest()[:10]
442+
# random 8 digit suffix
443+
random_suffix = "".join(random.choices(string.digits, k=8))
444+
base_job_name = f"{test['name'][:20]}-{test_name_hash}-{random_suffix}"
445+
job_name = base_job_name.replace("_", "-")
446+
logger.info(f"Job name: {job_name}")
447+
kuberay_job_manager = KubeRayJobManager()
448+
retcode, duration = kuberay_job_manager.run_and_wait(
449+
job_name=job_name,
450+
image=test.get_anyscale_byod_image(),
451+
cmd_to_run=test["run"]["script"],
452+
env_vars=test.get_byod_runtime_env(),
453+
working_dir=working_dir_upload_path,
454+
pip=test.get_byod_pips(),
455+
compute_config=kuberay_compute_config,
456+
autoscaler_config=kuberay_autoscaler_config,
457+
timeout=command_timeout,
458+
)
459+
kuberay_job_manager.fetch_results()
460+
result.return_code = retcode
461+
result.runtime = duration
462+
except Exception as e:
463+
logger.info(f"Exception: {e}")
464+
pipeline_exception = e
465+
result.runtime = time.monotonic() - start_time
436466

437-
command_timeout = int(test["run"].get("timeout", DEFAULT_COMMAND_TIMEOUT))
438-
test_name_hash = hashlib.sha256(test["name"].encode()).hexdigest()[:10]
439-
# random 8 digit suffix
440-
random_suffix = "".join(random.choices(string.digits, k=8))
441-
job_name = f"{test['name'][:20]}-{test_name_hash}-{random_suffix}".replace("_", "-")
442-
logger.info(f"Job name: {job_name}")
443-
logger.info(f"KubeRay compute config: {kuberay_compute_config}")
444-
logger.info(f"KubeRay autoscaler config: {kuberay_autoscaler_config}")
445-
kuberay_job_manager = KubeRayJobManager()
446-
retcode, duration = kuberay_job_manager.run_and_wait(
447-
job_name=job_name,
448-
image=test.get_anyscale_byod_image(),
449-
cmd_to_run=test["run"]["script"],
450-
env_vars=test.get_byod_runtime_env(),
451-
working_dir=working_dir_upload_path,
452-
pip=test.get_byod_pips(),
453-
compute_config=kuberay_compute_config,
454-
autoscaler_config=kuberay_autoscaler_config,
455-
timeout=command_timeout,
456-
)
457-
kuberay_job_manager.fetch_results()
458-
result.return_code = retcode
459-
result.runtime = duration
467+
if pipeline_exception:
468+
buildkite_group(":rotating_light: Handling errors")
469+
exit_code, result_status, runtime = handle_exception(
470+
pipeline_exception,
471+
result.runtime,
472+
)
473+
474+
result.return_code = exit_code.value
475+
result.status = result_status.value
476+
if runtime is not None:
477+
result.runtime = runtime
478+
raise pipeline_exception
460479
return result
461480

462481

release/ray_release/tests/test_glue.py

Lines changed: 54 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,9 @@ def setUp(self) -> None:
8787
self.sdk.returns["get_cloud"] = APIDict(result=APIDict(provider="AWS"))
8888

8989
self.writeClusterEnv("{'env': true}")
90-
self.writeClusterCompute("{'compute': true}")
90+
self.writeClusterCompute(
91+
"{'head_node_type': {'name': 'head_node', 'instance_type': 'm5a.4xlarge'}, 'worker_node_types': []}"
92+
)
9193

9294
with open(os.path.join(self.tempdir, "driver_fail.sh"), "wt") as f:
9395
f.write("exit 1\n")
@@ -175,6 +177,23 @@ def mock_alerter(test: Test, result: Result):
175177
),
176178
alert="unit_test_alerter",
177179
)
180+
self.kuberay_test = MockTest(
181+
name="unit_test_end_to_end_kuberay",
182+
run=dict(
183+
type="unit_test",
184+
prepare="prepare_cmd",
185+
script="test_cmd",
186+
wait_for_nodes=dict(num_nodes=4, timeout=40),
187+
),
188+
working_dir=self.tempdir,
189+
cluster=dict(
190+
cluster_env="cluster_env.yaml",
191+
cluster_compute="cluster_compute.yaml",
192+
byod={},
193+
),
194+
env="kuberay",
195+
alert="unit_test_alerter",
196+
)
178197
self.anyscale_project = "prj_unit12345678"
179198

180199
def tearDown(self) -> None:
@@ -237,42 +256,67 @@ def _succeed_until(self, until: str):
237256

238257
self.mock_alert_return = None
239258

240-
def _run(self, result: Result, **kwargs):
241-
run_release_test(
242-
test=self.test,
243-
anyscale_project=self.anyscale_project,
244-
result=result,
245-
log_streaming_limit=1000,
246-
**kwargs
247-
)
259+
def _run(self, result: Result, kuberay: bool = False, **kwargs):
260+
if kuberay:
261+
run_release_test(
262+
test=self.kuberay_test,
263+
result=result,
264+
log_streaming_limit=1000,
265+
**kwargs
266+
)
267+
else:
268+
run_release_test(
269+
test=self.test,
270+
anyscale_project=self.anyscale_project,
271+
result=result,
272+
log_streaming_limit=1000,
273+
**kwargs
274+
)
248275

249276
def testInvalidClusterCompute(self):
250277
result = Result()
251278

279+
# Test with regular run
252280
with patch(
253281
"ray_release.glue.load_test_cluster_compute",
254282
_fail_on_call(ReleaseTestConfigError),
255283
), self.assertRaises(ReleaseTestConfigError):
256284
self._run(result)
257285
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
258286

287+
# Test with kuberay run
288+
with patch(
289+
"ray_release.glue.load_test_cluster_compute",
290+
_fail_on_call(ReleaseTestConfigError),
291+
), self.assertRaises(ReleaseTestConfigError):
292+
self._run(result, True)
293+
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
294+
259295
# Fails because file not found
260296
os.unlink(os.path.join(self.tempdir, "cluster_compute.yaml"))
261297
with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"):
262298
self._run(result)
263299
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
300+
with self.assertRaisesRegex(ReleaseTestConfigError, "Path not found"):
301+
self._run(result, True)
302+
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
264303

265304
# Fails because invalid jinja template
266305
self.writeClusterCompute("{{ INVALID")
267306
with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"):
268307
self._run(result)
269308
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
309+
with self.assertRaisesRegex(ReleaseTestConfigError, "yaml template"):
310+
self._run(result, True)
311+
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
270312

271313
# Fails because invalid json
272314
self.writeClusterCompute("{'test': true, 'fail}")
273315
with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"):
274316
self._run(result)
275-
317+
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
318+
with self.assertRaisesRegex(ReleaseTestConfigError, "quoted scalar"):
319+
self._run(result, True)
276320
self.assertEqual(result.return_code, ExitCode.CONFIG_ERROR.value)
277321

278322
def testStartClusterFails(self):

0 commit comments

Comments
 (0)