Move more functions into a common module

amaslenn · amaslenn · commit edd61a48fb39 · 2025-09-08T14:23:10.000+02:00
diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py
@@ -13,9 +13,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
+import logging
+from functools import cache
+from pathlib import Path
+from typing import TYPE_CHECKING
 
 from cloudai.systems.slurm import SlurmCommandGenStrategy
+from cloudai.util.lazy_imports import lazy
+
+if TYPE_CHECKING:
+    import pandas as pd
 
 
 class NIXLCmdGenBase(SlurmCommandGenStrategy):
@@ -101,3 +110,32 @@ def create_env_vars_file(self) -> None:
                 if key == "SLURM_JOB_MASTER_NODE":  # this is an sbatch-level variable, not needed per-node
                     continue
                 f.write(f"export {key}={value}\n")
+
+
+@cache
+def extract_nixlbench_data(stdout_file: Path) -> pd.DataFrame:
+    """Return the NIXLBench results table parsed from ``stdout_file`` (empty DataFrame if the file is missing)."""
+    if not stdout_file.exists():
+        logging.debug(f"{stdout_file} not found")
+        return lazy.pd.DataFrame()
+
+    header_present, data = False, []
+    for line in stdout_file.read_text().splitlines():
+        if not header_present:  # rows are only collected once the table header has been seen
+            header_present = (
+                "Block Size (B)      Batch Size     " in line and "Avg Lat. (us)" in line and "B/W (GB/Sec)" in line
+            )
+            continue
+        parts = line.split()
+        if len(parts) not in (6, 10):
+            continue
+        # 6 tokens: latency is the 3rd and B/W (GB/Sec) the last; 10 tokens: B/W is the 3rd and latency the 4th.
+        lat, bw = (parts[2], parts[-1]) if len(parts) == 6 else (parts[3], parts[2])
+        try:
+            data.append([int(parts[0]), int(parts[1]), float(lat), float(bw)])
+        except ValueError:
+            continue  # not a data row, just a log line that happens to have 6 or 10 tokens
+
+    columns = {"block_size": int, "batch_size": int, "avg_lat": float, "bw_gb_sec": float}
+    # astype() keeps the numeric dtypes even when no rows were parsed.
+    return lazy.pd.DataFrame(data, columns=list(columns)).astype(columns)
diff --git a/src/cloudai/workloads/nixl_bench/nixl_bench.py b/src/cloudai/workloads/nixl_bench/nixl_bench.py
@@ -16,17 +16,9 @@
 
 from __future__ import annotations
 
-import logging
-from functools import cache
-from pathlib import Path
-from typing import TYPE_CHECKING, Optional
-
 from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun
 from cloudai.models.workload import CmdArgs, TestDefinition
-from cloudai.util.lazy_imports import lazy
-
-if TYPE_CHECKING:
-    import pandas as pd
+from cloudai.workloads.common.nixl import extract_nixlbench_data
 
 
 class NIXLBenchCmdArgs(CmdArgs):
@@ -42,7 +34,7 @@ class NIXLBenchTestDefinition(TestDefinition):
     """Test definition for a NIXL Bench test."""
 
     cmd_args: NIXLBenchCmdArgs
-    _nixl_image: Optional[DockerImage] = None
+    _nixl_image: DockerImage | None = None
 
     @property
     def docker_image(self) -> DockerImage:
@@ -59,37 +51,8 @@ def cmd_args_dict(self) -> dict[str, str | list[str]]:
         return self.cmd_args.model_dump(exclude={"docker_image_url", "path_to_benchmark", "cmd_args", "etcd_path"})
 
     def was_run_successful(self, tr: TestRun) -> JobStatusResult:
-        df = extract_nixl_data(tr.output_path / "stdout.txt")
+        df = extract_nixlbench_data(tr.output_path / "stdout.txt")
         if df.empty:
             return JobStatusResult(is_successful=False, error_message=f"NIXLBench data not found in {tr.output_path}.")
 
         return JobStatusResult(is_successful=True)
-
-
-@cache
-def extract_nixl_data(stdout_file: Path) -> pd.DataFrame:
-    if not stdout_file.exists():
-        logging.debug(f"{stdout_file} not found")
-        return lazy.pd.DataFrame()
-
-    header_present, data = False, []
-    for line in stdout_file.read_text().splitlines():
-        if not header_present and (
-            "Block Size (B)      Batch Size     " in line and "Avg Lat. (us)" in line and "B/W (GB/Sec)" in line
-        ):
-            header_present = True
-            continue
-        parts = line.split()
-        if header_present and (len(parts) == 6 or len(parts) == 10):
-            if len(parts) == 6:
-                data.append([parts[0], parts[1], parts[2], parts[-1]])
-            else:
-                data.append([parts[0], parts[1], parts[3], parts[2]])
-
-    df = lazy.pd.DataFrame(data, columns=["block_size", "batch_size", "avg_lat", "bw_gb_sec"])
-    df["block_size"] = df["block_size"].astype(int)
-    df["batch_size"] = df["batch_size"].astype(int)
-    df["avg_lat"] = df["avg_lat"].astype(float)
-    df["bw_gb_sec"] = df["bw_gb_sec"].astype(float)
-
-    return df
diff --git a/src/cloudai/workloads/nixl_bench/report_generation_strategy.py b/src/cloudai/workloads/nixl_bench/report_generation_strategy.py
@@ -23,7 +23,7 @@
 from cloudai.core import METRIC_ERROR, ReportGenerationStrategy
 from cloudai.report_generator.tool.bokeh_report_tool import BokehReportTool
 from cloudai.util.lazy_imports import lazy
-from cloudai.workloads.nixl_bench.nixl_bench import extract_nixl_data
+from cloudai.workloads.common.nixl import extract_nixlbench_data
 
 
 class NIXLBenchReportGenerationStrategy(ReportGenerationStrategy):
@@ -36,27 +36,27 @@ def results_file(self) -> Path:
         return self.test_run.output_path / "stdout.txt"
 
     def can_handle_directory(self) -> bool:
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
         return not df.empty
 
     def generate_report(self) -> None:
         if not self.can_handle_directory():
             return
 
         self.generate_bokeh_report()
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
         df.to_csv(self.test_run.output_path / "nixlbench.csv", index=False)
 
     def get_metric(self, metric: str) -> float:
         logging.debug(f"Getting metric {metric} from {self.results_file.absolute()}")
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
         if df.empty or metric not in {"default", "latency"}:
             return METRIC_ERROR
 
         return float(lazy.np.mean(df["avg_lat"]))
 
     def generate_bokeh_report(self) -> None:
-        df = extract_nixl_data(self.results_file)
+        df = extract_nixlbench_data(self.results_file)
 
         report_tool = BokehReportTool(self.test_run.output_path)
         p = report_tool.add_log_x_linear_y_multi_line_plot(
diff --git a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py
@@ -19,8 +19,7 @@
 from typing import Literal
 
 from cloudai.core import CmdArgs, DockerImage, Installable, JobStatusResult, TestDefinition, TestRun
-
-from ..nixl_bench.nixl_bench import extract_nixl_data
+from cloudai.workloads.common.nixl import extract_nixlbench_data
 
 
 class NIXLKVBenchCmdArgs(CmdArgs):
@@ -67,7 +66,7 @@ def cmd_args_dict(self) -> dict[str, str | list[str]]:
         )
 
     def was_run_successful(self, tr: TestRun) -> JobStatusResult:
-        df = extract_nixl_data(tr.output_path / "stdout.txt")
+        df = extract_nixlbench_data(tr.output_path / "stdout.txt")
         if df.empty:
             return JobStatusResult(is_successful=False, error_message=f"NIXLBench data not found in {tr.output_path}.")
 
diff --git a/tests/ref_data/nixl-perftest.sbatch b/tests/ref_data/nixl-perftest.sbatch
@@ -16,7 +16,6 @@ srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mo
 
 srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
 
-echo SLURM_JOB_MASTER_NODE=$SLURM_JOB_MASTER_NODE
 srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks-per-node=1 --ntasks=1 -N1 bash -c "/workspace/nixl/.venv/bin/python /workspace/nixl/benchmark/kvbench/test/inference_workload_matgen.py generate --num-user-requests=2 --batch-size=1 --num-prefill-nodes=1 --num-decode-nodes=1 --results-dir=__OUTPUT_DIR__/output/matrices --prefill-tp=1 --prefill-pp=1 --prefill-cp=1 --decode-tp=1 --decode-pp=1 --decode-cp=1 --model=model-name"
 srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/etcd.log --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 etcd --listen-client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379 --listen-peer-urls=http://0.0.0.0:2380 --initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380 --initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380" --initial-cluster-state=new  &
 etcd_pid=$!
diff --git a/tests/report_generation_strategy/test_nixl_bench_report.py b/tests/report_generation_strategy/test_nixl_bench_report.py
@@ -20,8 +20,8 @@
 
 from cloudai.core import Test, TestRun, TestTemplate
 from cloudai.systems.slurm import SlurmSystem
+from cloudai.workloads.common.nixl import extract_nixlbench_data
 from cloudai.workloads.nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
-from cloudai.workloads.nixl_bench.nixl_bench import extract_nixl_data
 
 LEGACY_FORMAT = """
 Block Size (B)      Batch Size     Avg Lat. (us)  B/W (MiB/Sec)  B/W (GiB/Sec)  B/W (GB/Sec)
@@ -67,7 +67,7 @@ def nixl_tr(tmp_path: Path, slurm_system: SlurmSystem) -> TestRun:
 )
 def test_nixl_bench_report_parsing(tmp_path: Path, sample: str, exp_latency: list[float], exp_bw: list[float]):
     (tmp_path / "nixl_bench.log").write_text(sample)
-    df = extract_nixl_data(tmp_path / "nixl_bench.log")
+    df = extract_nixlbench_data(tmp_path / "nixl_bench.log")
     assert df.shape == (4, 4)
     assert df["block_size"].tolist() == [4096, 8192, 33554432, 67108864]
     assert df["batch_size"].tolist() == [1, 1, 1, 1]