Skip to content

Commit fa7c36a

Browse files
authored
Refactor NcclTestReportGenerationStrategy (#403)
1 parent 0ce5c57 commit fa7c36a

File tree

3 files changed

+201
-176
lines changed

3 files changed

+201
-176
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import re
18+
from pathlib import Path
19+
from typing import List, Optional, Tuple
20+
21+
import pandas as pd
22+
23+
from cloudai.report_generator.tool.bokeh_report_tool import BokehReportTool
24+
from cloudai.report_generator.tool.csv_report_tool import CSVReportTool
25+
from cloudai.report_generator.util import add_human_readable_sizes
26+
27+
28+
class NcclTestPerformanceReportGenerator:
    """Extract and transform NCCL test output data and generate CSV and Bokeh reports.

    The generator reads ``stdout.txt`` from ``output_path``, turns every line that
    starts with a digit into a row of a pandas DataFrame, and passes that frame to
    ``CSVReportTool`` and ``BokehReportTool``.
    """

    def __init__(self, output_path: Path, test_name: str, sol: Optional[float] = None):
        """
        Store the locations and labels used by the report methods.

        Args:
            output_path (Path): Directory that contains ``stdout.txt``; also passed to the report tools.
            test_name (str): Name used as the prefix of every plot title.
            sol (Optional[float]): Optional value forwarded unchanged to the plots as ``sol``.
        """
        self.stdout_path = output_path / "stdout.txt"
        self.output_path = output_path
        self.test_name = test_name
        self.sol = sol

    def generate(self) -> None:
        """Generate the CSV and Bokeh reports; do nothing when no data rows were parsed."""
        # The average bus bandwidth is parsed but not used by the reports.
        df, _ = self._extract_data()
        if df.empty:
            return

        self._generate_csv_report(df)
        self._generate_bokeh_report(df)

    def _extract_data(self) -> Tuple[pd.DataFrame, Optional[float]]:
        """
        Build a typed DataFrame from the parsed rows of ``stdout.txt``.

        Returns:
            Tuple[pd.DataFrame, Optional[float]]: The data frame and the average bus bandwidth.
                When no rows were parsed, an empty frame and ``None`` are returned.
        """
        data, avg_bus_bw = self._parse_stdout()
        if not data:
            return pd.DataFrame(), None

        df = pd.DataFrame(
            data,
            columns=[
                "Size (B)",
                "Count",
                "Type",
                "Redop",
                "Root",
                "Time (us) Out-of-place",
                "Algbw (GB/s) Out-of-place",
                "Busbw (GB/s) Out-of-place",
                "#Wrong Out-of-place",
                "Time (us) In-place",
                "Algbw (GB/s) In-place",
                "Busbw (GB/s) In-place",
                "#Wrong In-place",
            ],
        )

        # Rows are parsed as strings; convert the numeric columns used by the reports.
        df["Size (B)"] = df["Size (B)"].astype(int)
        for time_column in ("Time (us) Out-of-place", "Time (us) In-place"):
            df[time_column] = df[time_column].astype(float).round(1)
        for bandwidth_column in (
            "Algbw (GB/s) Out-of-place",
            "Busbw (GB/s) Out-of-place",
            "Algbw (GB/s) In-place",
            "Busbw (GB/s) In-place",
        ):
            df[bandwidth_column] = df[bandwidth_column].astype(float)

        # NOTE(review): presumably adds a "Size Human-readable" column derived from "Size (B)";
        # confirm against cloudai.report_generator.util.
        df = add_human_readable_sizes(df, "Size (B)", "Size Human-readable")
        return df, avg_bus_bw

    def _parse_stdout(self) -> Tuple[List[List[str]], Optional[float]]:
        """
        Extract data rows and the average bus bandwidth from ``stdout.txt``.

        Returns:
            Tuple[List[List[str]], Optional[float]]: Whitespace-split fields of every line that
                starts with a digit, and the value following "Avg bus bandwidth :" (``None`` when
                that line is absent). ``([], None)`` is returned when the file does not exist.
        """
        data: List[List[str]] = []
        avg_bus_bw: Optional[float] = None

        if not self.stdout_path.is_file():
            return data, avg_bus_bw

        # Read the file once. Iterating over the file handle and then calling read() on the
        # same handle yields an empty string, which would make the bandwidth search below
        # always fail.
        with self.stdout_path.open("r") as file:
            content = file.read()

        # Split on "\n" only, which is how iterating over a text file delimits lines.
        for raw_line in content.split("\n"):
            line = raw_line.strip()
            if re.match(r"^\d", line):  # Match lines starting with a digit
                data.append(re.split(r"\s+", line))

        avg_bus_bw_match = re.search(r"Avg bus bandwidth\s+:\s+(\d+\.\d+)", content)
        if avg_bus_bw_match:
            avg_bus_bw = float(avg_bus_bw_match.group(1))

        return data, avg_bus_bw

    def _generate_csv_report(self, df: pd.DataFrame) -> None:
        """
        Pass the DataFrame to ``CSVReportTool`` and finalize the CSV report.

        Args:
            df (pd.DataFrame): DataFrame containing the NCCL test data.
        """
        csv_report_tool = CSVReportTool(self.output_path)
        csv_report_tool.set_dataframe(df)
        csv_report_tool.finalize_report(Path("cloudai_nccl_test_csv_report.csv"))

    def _generate_bokeh_report(self, df: pd.DataFrame) -> None:
        """
        Add the bus bandwidth plots to a ``BokehReportTool`` and finalize the HTML report.

        Three plots are requested: out-of-place bus bandwidth, in-place bus bandwidth,
        and both series combined, each plotted against "Size (B)".

        Args:
            df (pd.DataFrame): DataFrame containing the NCCL test data.
        """
        report_tool = BokehReportTool(self.output_path)

        line_plots = [
            ("Busbw (GB/s) Out-of-place", "blue", "Out-of-place Bus Bandwidth"),
            ("Busbw (GB/s) In-place", "green", "In-place Bus Bandwidth"),
        ]
        for col_name, color, title in line_plots:
            report_tool.add_log_x_linear_y_multi_line_plot(
                title=f"{self.test_name} {title}",
                x_column="Size (B)",
                y_columns=[(col_name, color)],
                x_axis_label="Message Size",
                y_axis_label="Bandwidth (GB/s)",
                df=df,
                sol=self.sol,
            )

        combined_columns = [("Busbw (GB/s) Out-of-place", "blue"), ("Busbw (GB/s) In-place", "green")]
        report_tool.add_log_x_linear_y_multi_line_plot(
            title=f"{self.test_name} Combined Bus Bandwidth",
            x_column="Size (B)",
            y_columns=combined_columns,
            x_axis_label="Message Size",
            y_axis_label="Bandwidth (GB/s)",
            df=df,
            sol=self.sol,
        )

        report_tool.finalize_report(Path("cloudai_nccl_test_bokeh_report.html"))

src/cloudai/workloads/nccl_test/report_generation_strategy.py

Lines changed: 19 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -15,137 +15,37 @@
1515
# limitations under the License.
1616

1717
import re
18-
from pathlib import Path
19-
from typing import List, Optional, Tuple
2018

21-
import pandas as pd
19+
from cloudai import ReportGenerationStrategy, System, TestRun
2220

23-
from cloudai import ReportGenerationStrategy
24-
from cloudai.report_generator.tool.bokeh_report_tool import BokehReportTool
25-
from cloudai.report_generator.tool.csv_report_tool import CSVReportTool
26-
from cloudai.report_generator.util import add_human_readable_sizes
21+
from .performance_report_generator import NcclTestPerformanceReportGenerator
2722

2823

2924
class NcclTestReportGenerationStrategy(ReportGenerationStrategy):
30-
"""
31-
Strategy for generating reports from NCCL test outputs.
25+
"""Strategy for generating reports from NCCL test outputs."""
3226

33-
Visualizing bus bandwidth changes over epochs using interactive Bokeh plots.
34-
"""
27+
def __init__(self, system: System, tr: TestRun) -> None:
28+
super().__init__(system, tr)
29+
self.performance_report = NcclTestPerformanceReportGenerator(
30+
self.test_run.output_path, self.test_run.name, self.test_run.sol
31+
)
3532

3633
def can_handle_directory(self) -> bool:
3734
stdout_path = self.test_run.output_path / "stdout.txt"
3835
if stdout_path.exists():
3936
with stdout_path.open("r") as file:
4037
content = file.read()
41-
if re.search(r"out-of-place|in-place", content) and re.search(
42-
r"\b(size\s+count\s+type\s+redop\s+root\s+"
43-
r"time\s+algbw\s+busbw\s+#wrong\s+time\s+"
44-
r"algbw\s+busbw\s+#wrong)\b",
45-
content,
46-
re.IGNORECASE,
47-
):
48-
return True
38+
return bool(
39+
re.search(r"out-of-place|in-place", content)
40+
and re.search(
41+
r"\b(size\s+count\s+type\s+redop\s+root\s+"
42+
r"time\s+algbw\s+busbw\s+#wrong\s+time\s+"
43+
r"algbw\s+busbw\s+#wrong)\b",
44+
content,
45+
re.IGNORECASE,
46+
)
47+
)
4948
return False
5049

5150
def generate_report(self) -> None:
52-
report_data, _ = self._parse_output()
53-
if report_data:
54-
df = pd.DataFrame(
55-
report_data,
56-
columns=[
57-
"Size (B)",
58-
"Count",
59-
"Type",
60-
"Redop",
61-
"Root",
62-
"Time (us) Out-of-place",
63-
"Algbw (GB/s) Out-of-place",
64-
"Busbw (GB/s) Out-of-place",
65-
"#Wrong Out-of-place",
66-
"Time (us) In-place",
67-
"Algbw (GB/s) In-place",
68-
"Busbw (GB/s) In-place",
69-
"#Wrong In-place",
70-
],
71-
)
72-
df["Size (B)"] = df["Size (B)"].astype(int)
73-
df["Time (us) Out-of-place"] = df["Time (us) Out-of-place"].astype(float).round(1)
74-
df["Time (us) In-place"] = df["Time (us) In-place"].astype(float).round(1)
75-
df["Algbw (GB/s) Out-of-place"] = df["Algbw (GB/s) Out-of-place"].astype(float)
76-
df["Busbw (GB/s) Out-of-place"] = df["Busbw (GB/s) Out-of-place"].astype(float)
77-
df["Algbw (GB/s) In-place"] = df["Algbw (GB/s) In-place"].astype(float)
78-
df["Busbw (GB/s) In-place"] = df["Busbw (GB/s) In-place"].astype(float)
79-
df = add_human_readable_sizes(df, "Size (B)", "Size Human-readable")
80-
81-
self._generate_bokeh_report(self.test_run.name, df)
82-
self._generate_csv_report(df)
83-
84-
def _parse_output(self) -> Tuple[List[List[str]], Optional[float]]:
85-
"""
86-
Extract data from 'stdout.txt' for report generation.
87-
88-
Returns:
89-
Tuple[List[List[str]], Optional[float]]: Parsed data and optional average bus bandwidth.
90-
"""
91-
stdout_path = self.test_run.output_path / "stdout.txt"
92-
avg_bus_bw = None
93-
data = []
94-
if stdout_path.is_file():
95-
with stdout_path.open("r") as file:
96-
for line in file:
97-
line = line.strip()
98-
if re.match(r"^\d", line):
99-
data.append(re.split(r"\s+", line))
100-
content = file.read()
101-
avg_bus_bw_match = re.search(r"Avg bus bandwidth\s+:\s+(\d+\.\d+)", content)
102-
avg_bus_bw = float(avg_bus_bw_match.group(1)) if avg_bus_bw_match else None
103-
return data, avg_bus_bw
104-
105-
def _generate_bokeh_report(self, test_name: str, df: pd.DataFrame) -> None:
106-
"""
107-
Create and saves plots to visualize NCCL test metrics.
108-
109-
Args:
110-
test_name (str): The name of the test.
111-
df (pd.DataFrame): DataFrame containing the NCCL test data.
112-
"""
113-
report_tool = BokehReportTool(self.test_run.output_path)
114-
line_plots = [
115-
("Busbw (GB/s) Out-of-place", "blue", "Out-of-place Bus Bandwidth"),
116-
("Busbw (GB/s) In-place", "green", "In-place Bus Bandwidth"),
117-
]
118-
for col_name, color, title in line_plots:
119-
report_tool.add_log_x_linear_y_multi_line_plot(
120-
title=f"{test_name} {title}",
121-
x_column="Size (B)",
122-
y_columns=[(col_name, color)],
123-
x_axis_label="Message Size",
124-
y_axis_label="Bandwidth (GB/s)",
125-
df=df,
126-
sol=self.test_run.sol,
127-
)
128-
129-
combined_columns = [("Busbw (GB/s) Out-of-place", "blue"), ("Busbw (GB/s) In-place", "green")]
130-
report_tool.add_log_x_linear_y_multi_line_plot(
131-
title=f"{test_name} Combined Bus Bandwidth",
132-
x_column="Size (B)",
133-
y_columns=combined_columns,
134-
x_axis_label="Message Size",
135-
y_axis_label="Bandwidth (GB/s)",
136-
df=df,
137-
sol=self.test_run.sol,
138-
)
139-
140-
report_tool.finalize_report(Path("cloudai_nccl_test_bokeh_report.html"))
141-
142-
def _generate_csv_report(self, df: pd.DataFrame) -> None:
143-
"""
144-
Generate a CSV report from the DataFrame.
145-
146-
Args:
147-
df (pd.DataFrame): DataFrame containing the NCCL test data.
148-
"""
149-
csv_report_tool = CSVReportTool(self.test_run.output_path)
150-
csv_report_tool.set_dataframe(df)
151-
csv_report_tool.finalize_report(Path("cloudai_nccl_test_csv_report.csv"))
51+
self.performance_report.generate()

0 commit comments

Comments
 (0)