Merge pull request #576 from NVIDIA/am/nixl-summary-report

amaslenn · web-flow · commit 4806f70a2ddb · 2025-06-25T09:29:12.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
   "pydantic==2.8.2",
   "jinja2==3.1.6",
   "websockets==15.0.1",
+  "rich==14.0.0",
 ]
 requires-python = ">=3.10"
 scripts = { cloudai = "cloudai.__main__:main" }
diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py
@@ -95,6 +95,7 @@ def register_all():
         NIXLBenchJobStatusRetrievalStrategy,
         NIXLBenchReportGenerationStrategy,
         NIXLBenchSlurmCommandGenStrategy,
+        NIXLBenchSummaryReport,
         NIXLBenchTestDefinition,
     )
     from cloudai.workloads.sleep import (
@@ -313,6 +314,7 @@ def register_all():
     Registry().add_scenario_report("per_test", PerTestReporter, ReportConfig(enable=True))
     Registry().add_scenario_report("status", StatusReporter, ReportConfig(enable=True))
     Registry().add_scenario_report("tarball", TarballReporter, ReportConfig(enable=True))
+    Registry().add_scenario_report("nixl_bench_summary", NIXLBenchSummaryReport, ReportConfig(enable=True))
 
     Registry().add_reward_function("inverse", inverse_reward)
     Registry().add_reward_function("negative", negative_reward)
diff --git a/src/cloudai/util/lazy_imports.py b/src/cloudai/util/lazy_imports.py
@@ -21,6 +21,7 @@
 
 if TYPE_CHECKING:
     import bokeh
+    import bokeh.embed as bokeh_embed
     import bokeh.layouts as bokeh_layouts
     import bokeh.models as bokeh_models
     import bokeh.palettes as bokeh_pallettes
@@ -44,6 +45,7 @@ def __init__(self):
         self._bokeh_layouts: ModuleType | None = None
         self._bokeh_transform: ModuleType | None = None
         self._bokeh_pallettes: ModuleType | None = None
+        self._bokeh_embed: ModuleType | None = None
 
     @property
     def np(self) -> np:  # type: ignore[no-any-return]
@@ -133,5 +135,15 @@ def bokeh_pallettes(self) -> bokeh_pallettes:  # type: ignore[no-any-return]
 
         return cast("bokeh_pallettes", self._bokeh_pallettes)
 
+    @property
+    def bokeh_embed(self) -> bokeh_embed:  # type: ignore[no-any-return]
+        """Return the `bokeh.embed` module, importing it on first access."""
+        if self._bokeh_embed is not None:
+            return cast("bokeh_embed", self._bokeh_embed)
+
+        # Deferred import: bokeh.embed is only loaded once this property is actually used.
+        import bokeh.embed as embed_module
+
+        self._bokeh_embed = embed_module
+        return cast("bokeh_embed", embed_module)
+
 
 lazy = LazyImports()
diff --git a/src/cloudai/util/nixl_report_template.jinja2 b/src/cloudai/util/nixl_report_template.jinja2
@@ -0,0 +1,63 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+    <title>{{ title }}</title>
+    <meta charset="UTF-8">
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.4.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.0.min.js"></script>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+            margin: 20px;
+            line-height: 1.6;
+            color: #333;
+        }
+
+        h1 {
+            color: #333;
+            border-bottom: 2px solid #eee;
+            padding-bottom: 10px;
+            margin-bottom: 30px;
+        }
+
+        h2 {
+            color: #666;
+            margin-top: 30px;
+            margin-bottom: 15px;
+        }
+
+        .charts-section {
+            margin: 30px 0;
+        }
+
+        .tables-section {
+            margin: 30px 0;
+        }
+
+        .description {
+            color: #666;
+            font-style: italic;
+            margin-bottom: 20px;
+        }
+    </style>
+    {{ bokeh_script | safe }}
+</head>
+
+<body>
+    <h1>{{ title }}</h1>
+
+    <div class="charts-section">
+        <h2>Interactive Charts</h2>
+        <p class="description">Use the interactive tools to zoom, pan, and hover over data points. Click on legend items
+            to show/hide lines.</p>
+        {{ bokeh_div | safe }}
+    </div>
+
+    <div class="tables-section">
+        {{ rich_html | safe }}
+    </div>
+</body>
+
+</html>
diff --git a/src/cloudai/workloads/nixl_bench/__init__.py b/src/cloudai/workloads/nixl_bench/__init__.py
@@ -16,6 +16,7 @@
 
 from .job_status_retrieval_strategy import NIXLBenchJobStatusRetrievalStrategy
 from .nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
+from .nixl_summary_report import NIXLBenchSummaryReport
 from .report_generation_strategy import NIXLBenchReportGenerationStrategy
 from .slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
 
@@ -24,5 +25,6 @@
     "NIXLBenchJobStatusRetrievalStrategy",
     "NIXLBenchReportGenerationStrategy",
     "NIXLBenchSlurmCommandGenStrategy",
+    "NIXLBenchSummaryReport",
     "NIXLBenchTestDefinition",
 ]
diff --git a/src/cloudai/workloads/nixl_bench/nixl_summary_report.py b/src/cloudai/workloads/nixl_bench/nixl_summary_report.py
@@ -0,0 +1,204 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import jinja2
+import toml
+from rich.console import Console
+from rich.table import Table
+
+from cloudai.core import Reporter, System, TestScenario
+from cloudai.models.scenario import ReportConfig
+from cloudai.util.lazy_imports import lazy
+
+from .nixl_bench import NIXLBenchTestDefinition
+
+if TYPE_CHECKING:
+    import bokeh.plotting as bk
+    import pandas as pd
+
+
+@dataclass
+class TdefResult:
+    """Pair a NIXL Bench test definition with the results `DataFrame` loaded for the same test run."""
+
+    # Test definition the results belong to.
+    tdef: NIXLBenchTestDefinition
+    # Results of the run as a pandas DataFrame.
+    results: pd.DataFrame
+
+
+class NIXLBenchSummaryReport(Reporter):
+    """
+    Scenario-level HTML summary report for NIXL Bench test runs.
+
+    One table and one chart are produced per `(op_type, metric)` entry of `report_configs`; everything is written
+    into a single `nixl_summary.html` file under `results_root`.
+    """
+
+    # Markup handed to `Console.export_html` so that the recorded tables are exported as one `<pre>` element.
+    # Rich's default format is a complete HTML document, which must not be nested inside the report's `<div>`.
+    RICH_HTML_FRAGMENT = (
+        "<pre style=\"font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">{code}</pre>"
+    )
+
+    def __init__(self, system: System, test_scenario: TestScenario, results_root: Path, config: ReportConfig) -> None:
+        """Initialize the reporter and define which metrics and operation types are reported."""
+        super().__init__(system, test_scenario, results_root, config)
+        self.tdef_res: list[TdefResult] = []
+        # Results column name -> human readable label used in table and chart titles.
+        self.metric2col = {
+            "avg_lat": "Avg. Latency (us)",
+            "bw_gb_sec": "Bandwidth (GB/sec)",
+        }
+        # (op_type, metric) pairs; each pair yields one table and one chart.
+        self.report_configs = [
+            ("READ", "bw_gb_sec"),
+            ("WRITE", "bw_gb_sec"),
+            ("READ", "avg_lat"),
+            ("WRITE", "avg_lat"),
+        ]
+
+    def generate(self) -> None:
+        """
+        Build the tables and charts and write them to `nixl_summary.html` in `results_root`.
+
+        Nothing is written when no NIXL Bench results were loaded, because the report would contain no data.
+        """
+        self.load_tdef_with_results()
+        if not self.tdef_res:
+            logging.debug(f"No NIXL Bench results found in {self.results_root}, skipping NIXL summary report")
+            return
+
+        # record=True keeps everything that is printed so it can be exported as HTML afterwards.
+        console = Console(record=True)
+        for op_type, metric in self.report_configs:
+            console.print(self.create_table(op_type, metric))
+            console.print()
+
+        bokeh_script, bokeh_div = self.get_bokeh_html()
+
+        # Autoescaping protects plain-text values such as the title; the template marks the pre-rendered HTML
+        # pieces with `| safe`, so they are still inserted verbatim.
+        env = jinja2.Environment(
+            loader=jinja2.FileSystemLoader(Path(__file__).parents[2] / "util"),
+            autoescape=True,
+        )
+        html_content = env.get_template("nixl_report_template.jinja2").render(
+            title=f"{self.test_scenario.name} NIXL Bench Report",
+            bokeh_script=bokeh_script,
+            bokeh_div=bokeh_div,
+            rich_html=console.export_html(inline_styles=True, code_format=self.RICH_HTML_FRAGMENT),
+        )
+
+        html_file = self.results_root / "nixl_summary.html"
+        # The template declares UTF-8 and Rich tables contain non-ASCII box-drawing characters,
+        # so the encoding must not depend on the locale.
+        html_file.write_text(html_content, encoding="utf-8")
+
+        logging.info(f"NIXL summary report created: {html_file}")
+
+    def load_tdef_with_results(self) -> None:
+        """
+        Load the test definition and the results of every NIXL Bench test run into `tdef_res`.
+
+        `trs` is narrowed down to NIXL Bench test runs. Runs that lack `test-run.toml` or `nixlbench.csv` in their
+        output directory are skipped with a warning. `tdef_res` is rebuilt from scratch on every call.
+        """
+        super().load_test_runs()
+        self.trs = [tr for tr in self.trs if isinstance(tr.test.test_definition, NIXLBenchTestDefinition)]
+
+        # Start from a clean list so that repeated calls do not duplicate entries.
+        self.tdef_res = []
+        for tr in self.trs:
+            toml_path = tr.output_path / "test-run.toml"
+            csv_path = tr.output_path / "nixlbench.csv"
+            missing = [path.name for path in (toml_path, csv_path) if not path.is_file()]
+            if missing:
+                logging.warning(f"Skipping {tr.output_path} in NIXL summary report, missing: {', '.join(missing)}")
+                continue
+
+            tdef = NIXLBenchTestDefinition.model_validate(toml.load(toml_path)["test_definition"])
+            self.tdef_res.append(TdefResult(tdef, lazy.pd.read_csv(csv_path)))
+
+    def create_table(self, op_type: str, metric: str) -> Table:
+        """
+        Create a Rich table with the `metric` values of all test runs that use `op_type`.
+
+        The first two columns are the block and batch sizes, every further column holds one test run.
+        """
+        df = self.construct_df(op_type, metric)
+        table = Table(title=f"{self.test_scenario.name}: {op_type} {self.metric2col[metric]}", title_justify="left")
+        for col in df.columns:
+            table.add_column(col, justify="right", style="cyan")
+
+        for _, row in df.iterrows():
+            # `iterrows` yields one Series per row, which upcasts the integer sizes when metric columns are
+            # floats; convert them back so they are not rendered as "4096.0".
+            block_size = int(row["block_size"])
+            batch_size = int(row["batch_size"])
+            table.add_row(str(block_size), str(batch_size), *[str(x) for x in row.values[2:]])
+        return table
+
+    def get_bokeh_html(self) -> tuple[str, str]:
+        """
+        Render all charts into embeddable Bokeh components.
+
+        Returns:
+            The `(script, div)` pair produced by `bokeh.embed.components` for a layout with two charts per row.
+        """
+        charts: list[bk.figure] = []
+        for op_type, metric in self.report_configs:
+            chart = self.create_chart(op_type, metric)
+            if chart is not None:
+                charts.append(chart)
+
+        # Layout with 2 charts per row; a trailing single chart gets a row of its own.
+        rows = [lazy.bokeh_layouts.row(*charts[i : i + 2]) for i in range(0, len(charts), 2)]
+        layout = lazy.bokeh_layouts.column(*rows, name="charts_layout")
+
+        bokeh_script, bokeh_div = lazy.bokeh_embed.components(layout)
+        return bokeh_script, bokeh_div
+
+    def construct_df(self, op_type: str, metric: str) -> pd.DataFrame:
+        """
+        Construct a `DataFrame` with results for all test runs.
+
+        Block size and Batch size are taken only once assuming they are the same across all test runs.
+        `op_type` is used to filter the test runs.
+
+        Every matching test run contributes one column named `<initiator_seg_type>-><target_seg_type>` holding
+        its `metric` values; test runs that share both segment types write to the same column, so the last one
+        wins. An empty `DataFrame` is returned when no test run matches `op_type`.
+        """
+        final_df = lazy.pd.DataFrame()
+
+        for tdef_res in self.tdef_res:
+            cmd_args = tdef_res.tdef.cmd_args_dict
+            if cmd_args.get("op_type", "unset") != op_type:
+                continue
+            if final_df.empty:
+                final_df["block_size"] = tdef_res.results["block_size"].astype(int)
+                final_df["batch_size"] = tdef_res.results["batch_size"].astype(int)
+
+            col_name = f"{cmd_args.get('initiator_seg_type', 'unset')}->{cmd_args.get('target_seg_type', 'unset')}"
+            final_df[col_name] = tdef_res.results[metric].astype(float)
+
+        return final_df
+
+    def create_chart(self, op_type: str, metric: str) -> bk.figure | None:
+        """
+        Create a line chart of `metric` over block size for all test runs that use `op_type`.
+
+        Values are averaged per block size, one line is drawn per test run column.
+
+        Returns:
+            The Bokeh figure, or `None` when there is no data for `op_type`.
+        """
+        df = self.construct_df(op_type, metric)
+        if df.empty:
+            logging.warning(f"Empty DataFrame for {op_type} {metric}")
+            return None
+
+        numeric_cols = [col for col in df.columns if col not in ["block_size", "batch_size"]]
+        grouped_df = df.groupby("block_size")[numeric_cols].mean().reset_index()
+
+        colors = ["blue", "red", "green", "orange", "purple", "brown", "pink", "gray"]
+
+        p = lazy.bokeh_plotting.figure(
+            title=f"{op_type} {self.metric2col[metric]} vs Block Size",
+            x_axis_label="Block Size",
+            y_axis_label=self.metric2col[metric],
+            width=800,
+            height=500,
+            tools="pan,box_zoom,wheel_zoom,reset,save",
+            active_drag="pan",
+            active_scroll="wheel_zoom",
+            x_axis_type="log",
+        )
+
+        hover = lazy.bokeh_models.HoverTool(
+            tooltips=[("Block Size", "@x"), ("Value", "@y"), ("Segment Type", "@segment_type")]
+        )
+        p.add_tools(hover)
+
+        for i, col in enumerate(numeric_cols):
+            # The palette is reused cyclically when there are more lines than colors.
+            color = colors[i % len(colors)]
+            source = lazy.bokeh_models.ColumnDataSource(
+                data={
+                    "x": grouped_df["block_size"].tolist(),
+                    "y": grouped_df[col].tolist(),
+                    "segment_type": [col] * len(grouped_df),
+                }
+            )
+
+            p.line("x", "y", source=source, line_color=color, line_width=2, legend_label=col)
+            p.scatter("x", "y", source=source, fill_color=color, size=8, legend_label=col)
+
+        p.legend.location = "top_left"
+        p.legend.click_policy = "hide"
+
+        y_max = grouped_df[numeric_cols].max().max()
+        y_min = grouped_df[numeric_cols].min().min()
+        # Keep the zero baseline visible with a margin of 1% of the largest value below it (or below the
+        # smallest value if that one is negative) and leave 10% headroom above the largest value.
+        p.y_range = lazy.bokeh_models.Range1d(start=min(y_min, 0) - abs(y_max) * 0.01, end=y_max * 1.1)
+
+        return p
diff --git a/tests/test_init.py b/tests/test_init.py
@@ -71,6 +71,7 @@
 from cloudai.workloads.nixl_bench import (
     NIXLBenchJobStatusRetrievalStrategy,
     NIXLBenchSlurmCommandGenStrategy,
+    NIXLBenchSummaryReport,
     NIXLBenchTestDefinition,
 )
 from cloudai.workloads.sleep import (
@@ -223,12 +224,12 @@ def test_definitions():
 
 def test_scenario_reports():
+    # Reports are compared as ordered lists, so this also pins the registration order.
     scenario_reports = Registry().scenario_reports
-    assert list(scenario_reports.keys()) == ["per_test", "status", "tarball"]
-    assert list(scenario_reports.values()) == [PerTestReporter, StatusReporter, TarballReporter]
+    assert list(scenario_reports.keys()) == ["per_test", "status", "tarball", "nixl_bench_summary"]
+    assert list(scenario_reports.values()) == [PerTestReporter, StatusReporter, TarballReporter, NIXLBenchSummaryReport]
 
 
 def test_report_configs():
+    # Every registered report must have a config, and each config must be enabled by default.
     configs = Registry().report_configs
-    assert list(configs.keys()) == ["per_test", "status", "tarball"]
+    assert list(configs.keys()) == ["per_test", "status", "tarball", "nixl_bench_summary"]
     for name, rep_config in configs.items():
         assert rep_config.enable is True, f"Report {name} is not enabled by default"

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@ dependencies = [`
`26`	`26`	`"pydantic==2.8.2",`
`27`	`27`	`"jinja2==3.1.6",`
`28`	`28`	`"websockets==15.0.1",`
	`29`	`+ "rich==14.0.0",`
`29`	`30`	`]`
`30`	`31`	`requires-python = ">=3.10"`
`31`	`32`	`scripts = { cloudai = "cloudai.__main__:main" }`