Skip to content

Commit 4806f70

Browse files
authored
Merge pull request #576 from NVIDIA/am/nixl-summary-report
2 parents 82ee1dd + 0976541 commit 4806f70

File tree

7 files changed

+288
-3
lines changed

7 files changed

+288
-3
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ dependencies = [
2626
"pydantic==2.8.2",
2727
"jinja2==3.1.6",
2828
"websockets==15.0.1",
29+
"rich==14.0.0",
2930
]
3031
requires-python = ">=3.10"
3132
scripts = { cloudai = "cloudai.__main__:main" }

src/cloudai/registration.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ def register_all():
9595
NIXLBenchJobStatusRetrievalStrategy,
9696
NIXLBenchReportGenerationStrategy,
9797
NIXLBenchSlurmCommandGenStrategy,
98+
NIXLBenchSummaryReport,
9899
NIXLBenchTestDefinition,
99100
)
100101
from cloudai.workloads.sleep import (
@@ -313,6 +314,7 @@ def register_all():
313314
Registry().add_scenario_report("per_test", PerTestReporter, ReportConfig(enable=True))
314315
Registry().add_scenario_report("status", StatusReporter, ReportConfig(enable=True))
315316
Registry().add_scenario_report("tarball", TarballReporter, ReportConfig(enable=True))
317+
Registry().add_scenario_report("nixl_bench_summary", NIXLBenchSummaryReport, ReportConfig(enable=True))
316318

317319
Registry().add_reward_function("inverse", inverse_reward)
318320
Registry().add_reward_function("negative", negative_reward)

src/cloudai/util/lazy_imports.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
if TYPE_CHECKING:
2323
import bokeh
24+
import bokeh.embed as bokeh_embed
2425
import bokeh.layouts as bokeh_layouts
2526
import bokeh.models as bokeh_models
2627
import bokeh.palettes as bokeh_pallettes
@@ -44,6 +45,7 @@ def __init__(self):
4445
self._bokeh_layouts: ModuleType | None = None
4546
self._bokeh_transform: ModuleType | None = None
4647
self._bokeh_pallettes: ModuleType | None = None
48+
self._bokeh_embed: ModuleType | None = None
4749

4850
@property
4951
def np(self) -> np: # type: ignore[no-any-return]
@@ -133,5 +135,15 @@ def bokeh_pallettes(self) -> bokeh_pallettes: # type: ignore[no-any-return]
133135

134136
return cast("bokeh_pallettes", self._bokeh_pallettes)
135137

138+
@property
def bokeh_embed(self) -> bokeh_embed:  # type: ignore[no-any-return]
    """Return the ``bokeh.embed`` module, importing it on first access and caching it on the instance."""
    module = self._bokeh_embed
    if module is None:
        # Deferred import: the module is only loaded when this property is first read.
        import bokeh.embed as module

        self._bokeh_embed = module

    return cast("bokeh_embed", module)
147+
136148

137149
lazy = LazyImports()
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<!DOCTYPE html>
2+
<html>
3+
4+
<head>
5+
<title>{{ title }}</title>
6+
<meta charset="UTF-8">
7+
<script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.4.0.min.js"></script>
8+
<script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.0.min.js"></script>
9+
<script src="https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.0.min.js"></script>
10+
<style>
11+
body {
12+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
13+
margin: 20px;
14+
line-height: 1.6;
15+
color: #333;
16+
}
17+
18+
h1 {
19+
color: #333;
20+
border-bottom: 2px solid #eee;
21+
padding-bottom: 10px;
22+
margin-bottom: 30px;
23+
}
24+
25+
h2 {
26+
color: #666;
27+
margin-top: 30px;
28+
margin-bottom: 15px;
29+
}
30+
31+
.charts-section {
32+
margin: 30px 0;
33+
}
34+
35+
.tables-section {
36+
margin: 30px 0;
37+
}
38+
39+
.description {
40+
color: #666;
41+
font-style: italic;
42+
margin-bottom: 20px;
43+
}
44+
</style>
45+
{{ bokeh_script | safe }}
46+
</head>
47+
48+
<body>
49+
<h1>{{ title }}</h1>
50+
51+
<div class="charts-section">
52+
<h2>Interactive Charts</h2>
53+
<p class="description">Use the interactive tools to zoom, pan, and hover over data points. Click on legend items
54+
to show/hide lines.</p>
55+
{{ bokeh_div | safe }}
56+
</div>
57+
58+
<div class="tables-section">
59+
{{ rich_html | safe }}
60+
</div>
61+
</body>
62+
63+
</html>

src/cloudai/workloads/nixl_bench/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from .job_status_retrieval_strategy import NIXLBenchJobStatusRetrievalStrategy
1818
from .nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
19+
from .nixl_summary_report import NIXLBenchSummaryReport
1920
from .report_generation_strategy import NIXLBenchReportGenerationStrategy
2021
from .slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
2122

@@ -24,5 +25,6 @@
2425
"NIXLBenchJobStatusRetrievalStrategy",
2526
"NIXLBenchReportGenerationStrategy",
2627
"NIXLBenchSlurmCommandGenStrategy",
28+
"NIXLBenchSummaryReport",
2729
"NIXLBenchTestDefinition",
2830
]
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from __future__ import annotations
18+
19+
import logging
20+
from dataclasses import dataclass
21+
from pathlib import Path
22+
from typing import TYPE_CHECKING
23+
24+
import jinja2
25+
import toml
26+
from rich.console import Console
27+
from rich.table import Table
28+
29+
from cloudai.core import Reporter, System, TestScenario
30+
from cloudai.models.scenario import ReportConfig
31+
from cloudai.util.lazy_imports import lazy
32+
33+
from .nixl_bench import NIXLBenchTestDefinition
34+
35+
if TYPE_CHECKING:
36+
import bokeh.plotting as bk
37+
import pandas as pd
38+
39+
40+
@dataclass
class TdefResult:
    """Pair a NIXL Bench test definition with the results table that belongs to it."""

    # Test definition the results were produced for.
    tdef: NIXLBenchTestDefinition
    # Benchmark results for that test definition, held as a DataFrame.
    results: pd.DataFrame
46+
47+
48+
class NIXLBenchSummaryReport(Reporter):
    """
    Scenario-level summary report for NIXL Bench.

    Loads ``nixlbench.csv`` for every NIXL Bench test run of the scenario, prints one Rich table per
    (operation type, metric) pair and writes the same tables together with interactive Bokeh charts to
    ``nixl_summary.html`` under the results root.
    """

    # Fragment-only wrapper for Rich's HTML export. The default export format is a complete HTML document,
    # which must not be nested inside the ``<div>`` the report template places it in.
    _RICH_HTML_FRAGMENT = (
        "<pre style=\"font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">"
        '<code style="font-family:inherit">{code}</code></pre>'
    )

    def __init__(self, system: System, test_scenario: TestScenario, results_root: Path, config: ReportConfig) -> None:
        """Store the reporter context and define which (operation, metric) pairs are reported."""
        super().__init__(system, test_scenario, results_root, config)
        self.tdef_res: list[TdefResult] = []
        # Maps a results column name to its human-readable label.
        self.metric2col = {
            "avg_lat": "Avg. Latency (us)",
            "bw_gb_sec": "Bandwidth (GB/sec)",
        }
        # One table and one chart are produced per (op_type, metric) entry, in this order.
        self.report_configs = [
            ("READ", "bw_gb_sec"),
            ("WRITE", "bw_gb_sec"),
            ("READ", "avg_lat"),
            ("WRITE", "avg_lat"),
        ]

    def generate(self) -> None:
        """
        Print the summary tables and write ``nixl_summary.html``.

        Nothing is printed or written when no NIXL Bench results could be loaded for the scenario.
        """
        self.load_tdef_with_results()
        if not self.tdef_res:
            logging.debug("No NIXL Bench results found, skipping NIXL summary report")
            return

        # ``record=True`` keeps everything printed so it can be exported to HTML afterwards.
        console = Console(record=True)
        for op_type, metric in self.report_configs:
            console.print(self.create_table(op_type, metric))
            console.print()

        bokeh_script, bokeh_div = self.get_bokeh_html()

        # Autoescaping protects plain values such as the title; the template marks the pre-rendered Bokeh and
        # Rich fragments with ``| safe`` so they are still emitted verbatim.
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(Path(__file__).parent.parent.parent / "util"),
            autoescape=True,
        )
        template = env.get_template("nixl_report_template.jinja2")
        html_content = template.render(
            title=f"{self.test_scenario.name} NIXL Bench Report",
            bokeh_script=bokeh_script,
            bokeh_div=bokeh_div,
            rich_html=console.export_html(code_format=self._RICH_HTML_FRAGMENT, inline_styles=True),
        )

        html_file = self.results_root / "nixl_summary.html"
        # The template declares UTF-8 and Rich tables contain box-drawing characters, so the platform default
        # encoding must not be relied upon.
        html_file.write_text(html_content, encoding="utf-8")

        logging.info("NIXL summary report created: %s", html_file)

    def load_tdef_with_results(self) -> None:
        """
        Populate ``self.tdef_res`` with the test definition and results of every NIXL Bench test run.

        Test runs of other workloads are dropped from ``self.trs``. Runs whose ``test-run.toml`` or
        ``nixlbench.csv`` is missing are skipped with a warning instead of aborting the whole report.
        """
        super().load_test_runs()
        self.trs = [tr for tr in self.trs if isinstance(tr.test.test_definition, NIXLBenchTestDefinition)]

        # Start from a clean list so repeated calls do not accumulate duplicate results.
        self.tdef_res = []
        for tr in self.trs:
            tdef_path = tr.output_path / "test-run.toml"
            results_path = tr.output_path / "nixlbench.csv"
            if not (tdef_path.is_file() and results_path.is_file()):
                logging.warning("Skipping %s: test-run.toml or nixlbench.csv is missing", tr.output_path)
                continue

            tr_file = toml.load(tdef_path)
            tdef = NIXLBenchTestDefinition.model_validate(tr_file["test_definition"])
            self.tdef_res.append(TdefResult(tdef, lazy.pd.read_csv(results_path)))

    def create_table(self, op_type: str, metric: str) -> Table:
        """Return a Rich table with ``metric`` values of all loaded runs whose operation type is ``op_type``."""
        df = self.construct_df(op_type, metric)
        table = Table(title=f"{self.test_scenario.name}: {op_type} {self.metric2col[metric]}", title_justify="left")
        for col in df.columns:
            table.add_column(col, justify="right", style="cyan")

        for _, row in df.iterrows():
            # ``iterrows`` upcasts the row to a common dtype, so sizes are converted back to integers for display.
            block_size = int(row["block_size"])
            batch_size = int(row["batch_size"])
            # ``construct_df`` always puts block_size and batch_size first; the rest are metric columns.
            table.add_row(str(block_size), str(batch_size), *(str(value) for value in row.values[2:]))
        return table

    def get_bokeh_html(self) -> tuple[str, str]:
        """Return the Bokeh ``(script, div)`` HTML snippets for all non-empty charts, laid out two per row."""
        candidates = (self.create_chart(op_type, metric) for op_type, metric in self.report_configs)
        charts: list[bk.figure] = [chart for chart in candidates if chart is not None]

        # Slicing yields a pair of charts per row and a single chart for an odd trailing one.
        rows = [lazy.bokeh_layouts.row(*charts[i : i + 2]) for i in range(0, len(charts), 2)]
        layout = lazy.bokeh_layouts.column(*rows, name="charts_layout")

        bokeh_script, bokeh_div = lazy.bokeh_embed.components(layout)
        return bokeh_script, bokeh_div

    def construct_df(self, op_type: str, metric: str) -> pd.DataFrame:
        """
        Construct a `DataFrame` with results for all test runs.

        Block size and Batch size are taken only once assuming they are the same across all test runs.
        `op_type` is used to filter the test runs. Every matching run contributes one column named
        ``<initiator_seg_type>-><target_seg_type>`` that holds its `metric` values.
        """
        final_df = lazy.pd.DataFrame()

        for tdef_res in self.tdef_res:
            cmd_args = tdef_res.tdef.cmd_args_dict
            if cmd_args.get("op_type", "unset") != op_type:
                continue
            if final_df.empty:
                # The first matching run defines the size columns that all later columns are aligned to.
                final_df["block_size"] = tdef_res.results["block_size"].astype(int)
                final_df["batch_size"] = tdef_res.results["batch_size"].astype(int)

            col_name = f"{cmd_args.get('initiator_seg_type', 'unset')}->{cmd_args.get('target_seg_type', 'unset')}"
            final_df[col_name] = tdef_res.results[metric].astype(float)

        return final_df

    @staticmethod
    def _y_axis_bounds(y_min: float, y_max: float) -> tuple[float, float]:
        """
        Return ``(start, end)`` for the y axis of a chart.

        The axis starts slightly below zero (or below ``y_min`` if that is negative) and ends 10% above
        ``y_max``. The padding depends on ``y_max`` only, so the lower bound no longer grows with the product
        of the smallest and largest value.
        """
        # Fall back to a unit span so the range never collapses when there is no positive value.
        upper = y_max if y_max > 0 else 1.0
        return min(y_min, 0.0) - upper * 0.01, upper * 1.1

    def create_chart(self, op_type: str, metric: str) -> bk.figure | None:
        """
        Return a line chart of ``metric`` versus block size for ``op_type``.

        Values are averaged per block size and every segment-type column becomes one line. Returns ``None``
        when there is no data for the requested combination.
        """
        df = self.construct_df(op_type, metric)
        if df.empty:
            logging.warning("Empty DataFrame for %s %s", op_type, metric)
            return None

        numeric_cols = [col for col in df.columns if col not in ["block_size", "batch_size"]]
        grouped_df = df.groupby("block_size")[numeric_cols].mean().reset_index()

        colors = ["blue", "red", "green", "orange", "purple", "brown", "pink", "gray"]
        # Colors are reused cyclically when there are more columns than colors.
        y_columns = [(col, colors[i % len(colors)]) for i, col in enumerate(numeric_cols)]

        p = lazy.bokeh_plotting.figure(
            title=f"{op_type} {self.metric2col[metric]} vs Block Size",
            x_axis_label="Block Size",
            y_axis_label=self.metric2col[metric],
            width=800,
            height=500,
            tools="pan,box_zoom,wheel_zoom,reset,save",
            active_drag="pan",
            active_scroll="wheel_zoom",
            x_axis_type="log",
        )

        hover = lazy.bokeh_models.HoverTool(
            tooltips=[("Block Size", "@x"), ("Value", "@y"), ("Segment Type", "@segment_type")]
        )
        p.add_tools(hover)

        block_sizes = grouped_df["block_size"].tolist()
        for col, color in y_columns:
            source = lazy.bokeh_models.ColumnDataSource(
                data={
                    "x": block_sizes,
                    "y": grouped_df[col].tolist(),
                    "segment_type": [col] * len(grouped_df),
                }
            )

            # Line and markers share the legend label so a legend click toggles both.
            p.line("x", "y", source=source, line_color=color, line_width=2, legend_label=col)
            p.scatter("x", "y", source=source, fill_color=color, size=8, legend_label=col)

        p.legend.location = "top_left"
        p.legend.click_policy = "hide"

        y_max = float(grouped_df[numeric_cols].max().max())
        y_min = float(grouped_df[numeric_cols].min().min())
        start, end = self._y_axis_bounds(y_min, y_max)
        p.y_range = lazy.bokeh_models.Range1d(start=start, end=end)

        return p

tests/test_init.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
from cloudai.workloads.nixl_bench import (
7272
NIXLBenchJobStatusRetrievalStrategy,
7373
NIXLBenchSlurmCommandGenStrategy,
74+
NIXLBenchSummaryReport,
7475
NIXLBenchTestDefinition,
7576
)
7677
from cloudai.workloads.sleep import (
@@ -223,12 +224,12 @@ def test_definitions():
223224

224225
def test_scenario_reports():
225226
scenario_reports = Registry().scenario_reports
226-
assert list(scenario_reports.keys()) == ["per_test", "status", "tarball"]
227-
assert list(scenario_reports.values()) == [PerTestReporter, StatusReporter, TarballReporter]
227+
assert list(scenario_reports.keys()) == ["per_test", "status", "tarball", "nixl_bench_summary"]
228+
assert list(scenario_reports.values()) == [PerTestReporter, StatusReporter, TarballReporter, NIXLBenchSummaryReport]
228229

229230

230231
def test_report_configs():
231232
configs = Registry().report_configs
232-
assert list(configs.keys()) == ["per_test", "status", "tarball"]
233+
assert list(configs.keys()) == ["per_test", "status", "tarball", "nixl_bench_summary"]
233234
for name, rep_config in configs.items():
234235
assert rep_config.enable is True, f"Report {name} is not enabled by default"

0 commit comments

Comments
 (0)