|
15 | 15 | # limitations under the License. |
16 | 16 |
|
17 | 17 | import re |
18 | | -from pathlib import Path |
19 | | -from typing import List, Optional, Tuple |
20 | 18 |
|
21 | | -import pandas as pd |
| 19 | +from cloudai import ReportGenerationStrategy, System, TestRun |
22 | 20 |
|
23 | | -from cloudai import ReportGenerationStrategy |
24 | | -from cloudai.report_generator.tool.bokeh_report_tool import BokehReportTool |
25 | | -from cloudai.report_generator.tool.csv_report_tool import CSVReportTool |
26 | | -from cloudai.report_generator.util import add_human_readable_sizes |
| 21 | +from .performance_report_generator import NcclTestPerformanceReportGenerator |
27 | 22 |
|
28 | 23 |
|
class NcclTestReportGenerationStrategy(ReportGenerationStrategy):
    """Strategy for generating reports from NCCL test outputs."""

    def __init__(self, system: System, tr: TestRun) -> None:
        """
        Initialize the strategy and build its performance report generator.

        Args:
            system (System): Passed through unchanged to the base-class initializer.
            tr (TestRun): Passed through unchanged to the base-class initializer.
        """
        super().__init__(system, tr)
        # NOTE(review): self.test_run is presumably populated from `tr` by the
        # base-class initializer; confirm against ReportGenerationStrategy.
        self.performance_report = NcclTestPerformanceReportGenerator(
            self.test_run.output_path, self.test_run.name, self.test_run.sol
        )

    def can_handle_directory(self) -> bool:
        """
        Check whether 'stdout.txt' in the test run's output path looks like NCCL test output.

        Returns:
            bool: True when the file contains "out-of-place" or "in-place" (case-sensitive)
                and the NCCL results column header (matched case-insensitively);
                False otherwise, including when 'stdout.txt' is not a regular file.
        """
        stdout_path = self.test_run.output_path / "stdout.txt"
        # is_file() rather than exists(): a directory named 'stdout.txt' must be
        # reported as "cannot handle" instead of raising when it is opened.
        if not stdout_path.is_file():
            return False

        content = stdout_path.read_text()
        return bool(
            re.search(r"out-of-place|in-place", content)
            and re.search(
                r"\b(size\s+count\s+type\s+redop\s+root\s+"
                r"time\s+algbw\s+busbw\s+#wrong\s+time\s+"
                r"algbw\s+busbw\s+#wrong)\b",
                content,
                re.IGNORECASE,
            )
        )

    def generate_report(self) -> None:
        """Generate the report by delegating to the performance report generator."""
        self.performance_report.generate()
0 commit comments