Commit cd5ada8

Merge pull request #685 from NVIDIA/am/nixl
Update NIXL workloads
2 parents deb90dc + edd61a4 commit cd5ada8

File tree

15 files changed: +247, -314 lines changed

src/cloudai/workloads/common/nixl.py (new file)

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

import logging
from functools import cache
from pathlib import Path
from typing import TYPE_CHECKING

from cloudai.systems.slurm import SlurmCommandGenStrategy
from cloudai.util.lazy_imports import lazy

if TYPE_CHECKING:
    import pandas as pd


class NIXLCmdGenBase(SlurmCommandGenStrategy):
    """Base command generation strategy for NIXL-based workloads."""

    @property
    def final_env_vars(self) -> dict[str, str | list[str]]:
        env_vars = super().final_env_vars
        env_vars["NIXL_ETCD_NAMESPACE"] = "/nixl/kvbench/$(uuidgen)"
        env_vars["NIXL_ETCD_ENDPOINTS"] = '"$SLURM_JOB_MASTER_NODE:2379"'
        env_vars["SLURM_JOB_MASTER_NODE"] = "$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)"
        return env_vars

    @final_env_vars.setter
    def final_env_vars(self, value: dict[str, str | list[str]]) -> None:
        super().final_env_vars = value

    def gen_etcd_srun_command(self, etcd_path: str) -> list[str]:
        etcd_cmd = [
            etcd_path,
            "--listen-client-urls=http://0.0.0.0:2379",
            "--advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379",
            "--listen-peer-urls=http://0.0.0.0:2380",
            "--initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380",
            '--initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380"',
            "--initial-cluster-state=new",
        ]
        cmd = [
            *self.gen_srun_prefix(),
            f"--output={self.test_run.output_path.absolute() / 'etcd.log'}",
            "--overlap",
            "--ntasks-per-node=1",
            "--ntasks=1",
            "--nodelist=$SLURM_JOB_MASTER_NODE",
            "-N1",
            *etcd_cmd,
            " &",
        ]
        return cmd

    def gen_wait_for_etcd_command(self, timeout: int = 60) -> list[str]:
        cmd = [
            "timeout",
            str(timeout),
            "bash",
            "-c",
            '"until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; do sleep 1; done" || {\n',
            f' echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after {timeout} seconds";\n',
            " exit 1\n",
            "}",
        ]
        return cmd

    def gen_nixlbench_srun_commands(self, test_cmd: list[str], backend: str) -> list[list[str]]:
        prefix_part = self.gen_srun_prefix()
        bash_part = [
            "bash",
            "-c",
            f'"source {(self.test_run.output_path / "env_vars.sh").absolute()}; {" ".join(test_cmd)}"',
        ]
        tpn_part = ["--ntasks-per-node=1", "--ntasks=1", "-N1"]

        cmds = [
            [*prefix_part, "--overlap", "--nodelist=$SLURM_JOB_MASTER_NODE", *tpn_part, *bash_part],
        ]

        if backend.upper() == "UCX":
            nnodes, _ = self.get_cached_nodes_spec()
            if nnodes > 1:
                cmds = [
                    [*prefix_part, "--overlap", f"--relative={idx}", *tpn_part, *bash_part] for idx in range(nnodes)
                ]
            else:
                cmds *= max(2, nnodes)

        return cmds

    def create_env_vars_file(self) -> None:
        with (self.test_run.output_path / "env_vars.sh").open("w") as f:
            for key, value in self.final_env_vars.items():
                if key in {"NIXL_ETCD_ENDPOINTS", "NIXL_ETCD_NAMESPACE"}:
                    continue
                if key == "SLURM_JOB_MASTER_NODE":  # this is an sbatch-level variable, not needed per-node
                    continue
                f.write(f"export {key}={value}\n")


@cache
def extract_nixlbench_data(stdout_file: Path) -> pd.DataFrame:
    if not stdout_file.exists():
        logging.debug(f"{stdout_file} not found")
        return lazy.pd.DataFrame()

    header_present, data = False, []
    for line in stdout_file.read_text().splitlines():
        if not header_present and (
            "Block Size (B) Batch Size " in line and "Avg Lat. (us)" in line and "B/W (GB/Sec)" in line
        ):
            header_present = True
            continue
        parts = line.split()
        if header_present and (len(parts) == 6 or len(parts) == 10):
            if len(parts) == 6:
                data.append([parts[0], parts[1], parts[2], parts[-1]])
            else:
                data.append([parts[0], parts[1], parts[3], parts[2]])

    df = lazy.pd.DataFrame(data, columns=["block_size", "batch_size", "avg_lat", "bw_gb_sec"])
    df["block_size"] = df["block_size"].astype(int)
    df["batch_size"] = df["batch_size"].astype(int)
    df["avg_lat"] = df["avg_lat"].astype(float)
    df["bw_gb_sec"] = df["bw_gb_sec"].astype(float)

    return df

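The new shared module centralizes etcd bootstrap, environment-file creation, and nixlbench stdout parsing for NIXL workloads. A minimal usage sketch of the parser follows; it assumes cloudai is installed, and the results path below is hypothetical:

from pathlib import Path

from cloudai.workloads.common.nixl import extract_nixlbench_data

# Parse a nixlbench stdout capture into a DataFrame with columns
# block_size, batch_size, avg_lat, bw_gb_sec; empty if no table was found.
df = extract_nixlbench_data(Path("results/nixl_bench/stdout.txt"))  # hypothetical path
if df.empty:
    print("no NIXLBench table found in stdout")
else:
    print(df.groupby("block_size")["bw_gb_sec"].mean())
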
src/cloudai/workloads/nixl_bench/nixl_bench.py

Lines changed: 9 additions & 49 deletions
@@ -16,34 +16,25 @@
 
 from __future__ import annotations
 
-import logging
-from functools import cache
-from pathlib import Path
-from typing import TYPE_CHECKING, Optional
-
 from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun
 from cloudai.models.workload import CmdArgs, TestDefinition
-from cloudai.util.lazy_imports import lazy
-
-if TYPE_CHECKING:
-    import pandas as pd
+from cloudai.workloads.common.nixl import extract_nixlbench_data
 
 
 class NIXLBenchCmdArgs(CmdArgs):
     """Command line arguments for a NIXL Bench test."""
 
     docker_image_url: str
-    etcd_endpoint: str
     path_to_benchmark: str
+    etcd_path: str = "etcd"
+    etcd_endpoints: str = "http://$NIXL_ETCD_ENDPOINTS"
 
 
 class NIXLBenchTestDefinition(TestDefinition):
     """Test definition for a NIXL Bench test."""
 
     cmd_args: NIXLBenchCmdArgs
-    etcd_image_url: str
-    _nixl_image: Optional[DockerImage] = None
-    _etcd_image: Optional[DockerImage] = None
+    _nixl_image: DockerImage | None = None
 
     @property
     def docker_image(self) -> DockerImage:
@@ -52,47 +43,16 @@ def docker_image(self) -> DockerImage:
         return self._nixl_image
 
     @property
-    def etcd_image(self) -> DockerImage:
-        if not self._etcd_image:
-            self._etcd_image = DockerImage(url=self.etcd_image_url)
-        return self._etcd_image
+    def installables(self) -> list[Installable]:
+        return [self.docker_image, *self.git_repos]
 
     @property
-    def installables(self) -> list[Installable]:
-        return [self.docker_image, *self.git_repos, self.etcd_image]
+    def cmd_args_dict(self) -> dict[str, str | list[str]]:
+        return self.cmd_args.model_dump(exclude={"docker_image_url", "path_to_benchmark", "cmd_args", "etcd_path"})
 
     def was_run_successful(self, tr: TestRun) -> JobStatusResult:
-        df = extract_nixl_data(tr.output_path / "stdout.txt")
+        df = extract_nixlbench_data(tr.output_path / "stdout.txt")
         if df.empty:
             return JobStatusResult(is_successful=False, error_message=f"NIXLBench data not found in {tr.output_path}.")
 
         return JobStatusResult(is_successful=True)
-
-
-@cache
-def extract_nixl_data(stdout_file: Path) -> pd.DataFrame:
-    if not stdout_file.exists():
-        logging.debug(f"{stdout_file} not found")
-        return lazy.pd.DataFrame()
-
-    header_present, data = False, []
-    for line in stdout_file.read_text().splitlines():
-        if not header_present and (
-            "Block Size (B) Batch Size " in line and "Avg Lat. (us)" in line and "B/W (GB/Sec)" in line
-        ):
-            header_present = True
-            continue
-        parts = line.split()
-        if header_present and (len(parts) == 6 or len(parts) == 10):
-            if len(parts) == 6:
-                data.append([parts[0], parts[1], parts[2], parts[-1]])
-            else:
-                data.append([parts[0], parts[1], parts[3], parts[2]])
-
-    df = lazy.pd.DataFrame(data, columns=["block_size", "batch_size", "avg_lat", "bw_gb_sec"])
-    df["block_size"] = df["block_size"].astype(int)
-    df["batch_size"] = df["batch_size"].astype(int)
-    df["avg_lat"] = df["avg_lat"].astype(float)
-    df["bw_gb_sec"] = df["bw_gb_sec"].astype(float)
-
-    return df

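With this change the dedicated etcd container and its explicit endpoint argument disappear from the test definition: etcd is started from a path inside the benchmark image, and the endpoint defaults to the sbatch-level NIXL_ETCD_ENDPOINTS variable. A rough sketch of the new defaults, assuming the pydantic-style CmdArgs base accepts these fields directly (the image URL and benchmark path below are hypothetical):

from cloudai.workloads.nixl_bench.nixl_bench import NIXLBenchCmdArgs

# Hypothetical values; only docker_image_url and path_to_benchmark are supplied here.
args = NIXLBenchCmdArgs(
    docker_image_url="nvcr.io/example/nixlbench:latest",  # hypothetical image
    path_to_benchmark="/usr/local/bin/nixlbench",  # hypothetical path
)
print(args.etcd_path)       # "etcd" by default
print(args.etcd_endpoints)  # "http://$NIXL_ETCD_ENDPOINTS"
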
src/cloudai/workloads/nixl_bench/report_generation_strategy.py

Lines changed: 5 additions & 5 deletions
@@ -23,7 +23,7 @@
 from cloudai.core import METRIC_ERROR, ReportGenerationStrategy
 from cloudai.report_generator.tool.bokeh_report_tool import BokehReportTool
 from cloudai.util.lazy_imports import lazy
-from cloudai.workloads.nixl_bench.nixl_bench import extract_nixl_data
+from cloudai.workloads.common.nixl import extract_nixlbench_data
 
 
 class NIXLBenchReportGenerationStrategy(ReportGenerationStrategy):
@@ -36,27 +36,27 @@ def results_file(self) -> Path:
         return self.test_run.output_path / "stdout.txt"
 
     def can_handle_directory(self) -> bool:
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
         return not df.empty
 
     def generate_report(self) -> None:
         if not self.can_handle_directory():
             return
 
         self.generate_bokeh_report()
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
         df.to_csv(self.test_run.output_path / "nixlbench.csv", index=False)
 
     def get_metric(self, metric: str) -> float:
         logging.debug(f"Getting metric {metric} from {self.results_file.absolute()}")
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
         if df.empty or metric not in {"default", "latency"}:
             return METRIC_ERROR
 
         return float(lazy.np.mean(df["avg_lat"]))
 
     def generate_bokeh_report(self) -> None:
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
 
         report_tool = BokehReportTool(self.test_run.output_path)
         p = report_tool.add_log_x_linear_y_multi_line_plot(

src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py

Lines changed: 19 additions & 69 deletions
@@ -17,12 +17,13 @@
 from typing import cast
 
 from cloudai.core import TestRun
-from cloudai.systems.slurm import SlurmCommandGenStrategy, SlurmSystem
+from cloudai.systems.slurm import SlurmSystem
+from cloudai.workloads.common.nixl import NIXLCmdGenBase
 
 from .nixl_bench import NIXLBenchTestDefinition
 
 
-class NIXLBenchSlurmCommandGenStrategy(SlurmCommandGenStrategy):
+class NIXLBenchSlurmCommandGenStrategy(NIXLCmdGenBase):
     """Command generation strategy for NIXL Bench tests."""
 
     def __init__(self, system: SlurmSystem, test_run: TestRun) -> None:
@@ -36,88 +37,37 @@ def image_path(self) -> str | None:
     def _container_mounts(self) -> list[str]:
         return []
 
+    @property
+    def tdef(self) -> NIXLBenchTestDefinition:
+        return cast(NIXLBenchTestDefinition, self.test_run.test.test_definition)
+
     def _gen_srun_command(self) -> str:
-        with (self.test_run.output_path / "env_vars.sh").open("w") as f:
-            for key, value in self.final_env_vars.items():
-                if key == "SLURM_JOB_MASTER_NODE":  # this is an sbatch-level variable, not needed per-node
-                    continue
-                f.write(f"export {key}={value}\n")
+        self.create_env_vars_file()
 
-        etcd_command: list[str] = self.gen_etcd_srun_command()
-        nixl_commands = self.gen_nixl_srun_commands()
+        self._current_image_url = str(self.tdef.docker_image.installed_path)
+        etcd_command: list[str] = self.gen_etcd_srun_command(self.tdef.cmd_args.etcd_path)
+        nixl_commands = self.gen_nixlbench_srun_commands(
+            self.gen_nixlbench_command(), str(self.tdef.cmd_args_dict.get("backend", "unset"))
+        )
+        self._current_image_url = None
 
         commands: list[str] = [
             " ".join(etcd_command),
             "etcd_pid=$!",
-            "sleep 5",
+            " ".join(self.gen_wait_for_etcd_command()),
             *[" ".join(cmd) + " &\nsleep 15" for cmd in nixl_commands[:-1]],
             " ".join(nixl_commands[-1]),
             "kill -9 $etcd_pid",
         ]
         return "\n".join(commands)
 
-    def gen_etcd_srun_command(self) -> list[str]:
-        tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, self.test_run.test.test_definition)
-        self._current_image_url = str(tdef.etcd_image.installed_path)
-        etcd_cmd = [
-            "/usr/local/bin/etcd",
-            "--listen-client-urls",
-            "http://0.0.0.0:2379",
-            "--advertise-client-urls",
-            "http://$(hostname -I | awk '{print $1}'):2379",
-        ]
-        cmd = [
-            *self.gen_srun_prefix(),
-            "--overlap",
-            "--ntasks-per-node=1",
-            "--ntasks=1",
-            "--nodelist=$SLURM_JOB_MASTER_NODE",
-            "-N1",
-            "bash",
-            "-c",
-            f'"{" ".join(etcd_cmd)}" &',
-        ]
-        self._current_image_url = None
-        return cmd
-
     def gen_nixlbench_command(self) -> list[str]:
         tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, self.test_run.test.test_definition)
-        cmd = [tdef.cmd_args.path_to_benchmark, f"--etcd-endpoints {tdef.cmd_args.etcd_endpoint}"]
+        cmd = [tdef.cmd_args.path_to_benchmark]
 
-        other_args = tdef.cmd_args.model_dump(
-            exclude={"docker_image_url", "etcd_endpoint", "path_to_benchmark", "cmd_args"}
-        )
-        for k, v in other_args.items():
+        for k, v in tdef.cmd_args_dict.items():
+            if k == "etcd_endpoints":
+                k = "etcd-endpoints"
             cmd.append(f"--{k} {v}")
 
         return cmd
-
-    def gen_nixl_srun_commands(self) -> list[list[str]]:
-        tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, self.test_run.test.test_definition)
-        self._current_image_url = str(tdef.docker_image.installed_path)
-        prefix_part = self.gen_srun_prefix()
-        self._current_image_url = None
-
-        bash_part = [
-            "bash",
-            "-c",
-            f'"source {(self.test_run.output_path / "env_vars.sh").absolute()}; '
-            f'{" ".join(self.gen_nixlbench_command())}"',
-        ]
-        tpn_part = ["--ntasks-per-node=1", "--ntasks=1", "-N1"]
-
-        cmds = [
-            [*prefix_part, "--overlap", "--nodelist=$SLURM_JOB_MASTER_NODE", *tpn_part, *bash_part],
-        ]
-
-        backend = str(tdef.cmd_args_dict.get("backend", "unset")).upper()
-        if backend == "UCX":
-            nnodes, _ = self.get_cached_nodes_spec()
-            if nnodes > 1:
-                cmds = [
-                    [*prefix_part, "--overlap", f"--relative={idx}", *tpn_part, *bash_part] for idx in range(nnodes)
-                ]
-            else:
-                cmds *= max(2, nnodes)
-
-        return cmds

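For reference, the flag rendering in gen_nixlbench_command boils down to the loop below. This standalone restatement is illustrative only (the benchmark path and argument values are hypothetical); it shows how the etcd_endpoints field is renamed to the --etcd-endpoints flag that nixlbench expects:

def render_nixlbench_flags(path_to_benchmark: str, cmd_args: dict[str, str]) -> list[str]:
    # Mirror of gen_nixlbench_command: each remaining cmd_arg becomes a "--key value" token.
    cmd = [path_to_benchmark]
    for k, v in cmd_args.items():
        if k == "etcd_endpoints":
            k = "etcd-endpoints"  # dashed flag name on the nixlbench command line
        cmd.append(f"--{k} {v}")
    return cmd


# Hypothetical arguments:
print(render_nixlbench_flags(
    "/usr/local/bin/nixlbench",
    {"backend": "UCX", "etcd_endpoints": "http://$NIXL_ETCD_ENDPOINTS"},
))
# ['/usr/local/bin/nixlbench', '--backend UCX', '--etcd-endpoints http://$NIXL_ETCD_ENDPOINTS']
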