ray-project · aslonnie · Feb 7, 2026 · Feb 7, 2026 · Feb 7, 2026 · Feb 7, 2026
@@ -101,7 +101,6 @@ def create_cluster_env_from_image(
                     config_json=dict(
                         docker_image=image,
                         ray_version="nightly",
-                        env_vars=runtime_env,
                     ),
                 )
             )

@@ -54,10 +54,7 @@ def set_cluster_env(self):
             .replace(":", "_")
             .replace(".", "_")
         )
-        self.cluster_env_name = (
-            f"{byod_image_name_normalized}"
-            f"__env__{dict_hash(self.test.get_byod_runtime_env())}"
-        )
+        self.cluster_env_name = byod_image_name_normalized
 
     def set_cluster_compute(
         self,

@@ -18,8 +18,6 @@
     JobBrokenError,
     JobNoLogsError,
     JobOutOfRetriesError,
-    JobTerminatedBeforeStartError,
-    JobTerminatedError,
     LogsError,
     PrepareCommandError,
     PrepareCommandTimeout,
@@ -158,26 +156,15 @@ def wait_for_nodes(self, num_nodes: int, timeout: float = 900):
             f"python wait_cluster.py {num_nodes} {timeout}", timeout=timeout + 30
         )
 
-    def _handle_command_output(
-        self, job_status_code: int, error: str, raise_on_timeout: bool = True
-    ):
-        if job_status_code == -2:
-            raise JobBrokenError(f"Job state is 'BROKEN' with error:\n{error}\n")
-
-        if job_status_code == -3:
-            raise JobTerminatedError(
-                "Job entered 'TERMINATED' state (it was terminated "
-                "manually or Ray was stopped):"
-                f"\n{error}\n"
+    def _handle_command_output(self, job_state: int, raise_on_timeout: bool = True):
+        if job_state == -1:
+            raise JobOutOfRetriesError(
+                "Job returned non-success state: 'FAILED' "
+                "(command has not been ran or no logs could have been obtained)."
             )
 
-        if job_status_code == -4:
-            raise JobTerminatedBeforeStartError(
-                "Job entered 'TERMINATED' state before it started "
-                "(most likely due to inability to provision required nodes; "
-                "otherwise it was terminated manually or Ray was stopped):"
-                f"\n{error}\n"
-            )
+        if job_state == -2:
+            raise JobBrokenError("Job state is 'UNKNOWN'.")
 
         # First try to obtain the output.json from S3.
         # If that fails, try logs.
@@ -214,8 +201,7 @@ def _handle_command_output(
                     )
                 raise PrepareCommandError(
                     f"Prepare command '{self.prepare_commands[-1]}' returned "
-                    f"non-success status: {prepare_return_codes[-1]} with error:"
-                    f"\n{error}\n"
+                    f"non-success status: {prepare_return_codes[-1]}."
                 )
         else:
             raise JobNoLogsError("Could not obtain logs for the job.")
@@ -231,15 +217,7 @@ def _handle_command_output(
 
         if workload_status_code is not None and workload_status_code != 0:
             raise TestCommandError(
-                f"Command returned non-success status: {workload_status_code} with "
-                f"error:\n{error}\n"
-            )
-
-        if job_status_code == -1:
-            raise JobOutOfRetriesError(
-                "Job returned non-success state: 'OUT_OF_RETRIES' "
-                "(command has not been ran or no logs could have been obtained) "
-                f"with error:\n{error}\n"
+                f"Command returned non-success status: {workload_status_code}."
             )
 
     def _get_full_command_env(self, env: Optional[Dict[str, str]] = None):
@@ -348,18 +326,14 @@ def run_command(
             working_dir = azure_file_path
             logger.info(f"Working dir uploaded to {working_dir}")
 
-        job_status_code, time_taken = self.job_manager.run_and_wait(
+        job_state, time_taken = self.job_manager.run_and_wait(
             full_command,
             full_env,
             working_dir=working_dir,
             upload_path=self.upload_path,
             timeout=int(timeout),
         )
-        error_message = self.job_manager.job_error_message()
-
-        self._handle_command_output(
-            job_status_code, error_message, raise_on_timeout=raise_on_timeout
-        )
+        self._handle_command_output(job_state, raise_on_timeout=raise_on_timeout)
 
         return time_taken
 

@@ -23,26 +23,31 @@ def generate_custom_build_step_key(image: str) -> str:
 
 def get_images_from_tests(
     tests: List[Test], build_id: str
-) -> Tuple[List[Tuple[str, str, str, str]], Dict[str, List[str]]]:
+) -> Tuple[
+    List[Tuple[str, str, Optional[str], Optional[str], Optional[Dict[str, str]]]],
+    Dict[str, List[str]],
+]:
     """Get a list of custom BYOD images to build from a list of tests."""
-    custom_byod_images = set()
+    custom_byod_images = {}
     custom_image_test_names_map = {}
     for test in tests:
         if not test.require_custom_byod_image():
             continue
-        custom_byod_image_build = (
-            test.get_anyscale_byod_image(build_id),
-            test.get_anyscale_base_byod_image(build_id),
-            test.get_byod_post_build_script(),
-            test.get_byod_python_depset(),
-        )
-        custom_byod_images.add(custom_byod_image_build)
-        image_tag = custom_byod_image_build[0]
+        image_tag = test.get_anyscale_byod_image(build_id)
+        if image_tag not in custom_byod_images:
+            runtime_env = test.get_byod_runtime_env() or None
+            custom_byod_images[image_tag] = (
+                image_tag,
+                test.get_anyscale_base_byod_image(build_id),
+                test.get_byod_post_build_script(),
+                test.get_byod_python_depset(),
+                runtime_env,
+            )
         logger.info(f"To be built: {image_tag}")
         if image_tag not in custom_image_test_names_map:
             custom_image_test_names_map[image_tag] = []
         custom_image_test_names_map[image_tag].append(test.get_name())
-    return list(custom_byod_images), custom_image_test_names_map
+    return list(custom_byod_images.values()), custom_image_test_names_map
 
 
 def create_custom_build_yaml(destination_file: str, tests: List[Test]) -> None:
@@ -57,14 +62,38 @@ def create_custom_build_yaml(destination_file: str, tests: List[Test]) -> None:
         return
     build_config = {"group": "Custom images build", "steps": []}
     ray_want_commit = os.getenv("RAY_WANT_COMMIT_IN_IMAGE", "")
-    for image, base_image, post_build_script, python_depset in custom_byod_images:
+    for (
+        image,
+        base_image,
+        post_build_script,
+        python_depset,
+        runtime_env,
+    ) in custom_byod_images:
         logger.info(
-            f"Building custom BYOD image: {image}, base image: {base_image}, post build script: {post_build_script}"
+            f"Building custom BYOD image: {image}, base image: {base_image}, "
+            f"post build script: {post_build_script}, runtime_env: {runtime_env}"
         )
-        if not post_build_script and not python_depset:
+        if not post_build_script and not python_depset and not runtime_env:
             continue
         step_key = generate_custom_build_step_key(image)
         step_name = _get_step_name(image, step_key, custom_image_test_names_map[image])
+        env_args = ""
+        if runtime_env:
+            env_args = " ".join(
+                f"--env {k}={v}" for k, v in sorted(runtime_env.items())
+            )
+        build_cmd_parts = [
+            "bazelisk run //release:custom_byod_build --",
+            f"--image-name {image}",
+            f"--base-image {base_image}",
+        ]
+        if post_build_script:
+            build_cmd_parts.append(f"--post-build-script {post_build_script}")
+        if python_depset:
+            build_cmd_parts.append(f"--python-depset {python_depset}")
+        if env_args:
+            build_cmd_parts.append(env_args)
+        build_cmd = " ".join(build_cmd_parts)
         step = {
             "label": step_name,
             "key": step_key,
@@ -77,7 +106,7 @@ def create_custom_build_yaml(destination_file: str, tests: List[Test]) -> None:
                 "bash release/azure_docker_login.sh",
                 f"az acr login --name {AZURE_REGISTRY_NAME}",
                 f"aws ecr get-login-password --region {config['byod_ecr_region']} | docker login --username AWS --password-stdin {config['byod_ecr']}",
-                f"bazelisk run //release:custom_byod_build -- --image-name {image} --base-image {base_image} {f'--post-build-script {post_build_script}' if post_build_script else ''} {f'--python-depset {python_depset}' if python_depset else ''}",
+                build_cmd,
             ],
         }
         step["depends_on"] = get_prerequisite_step(image, base_image)