
Commit e4ac445

Handle multi-section CSV format in AI Dynamo report generation (#620)
1 parent 6204400 commit e4ac445
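For context, the profile_genai_perf.csv emitted by genai-perf can contain several blank-line-separated sections (percentile statistics, single-value metrics, and per-GPU telemetry in the test fixture below) rather than a single table, which the previous pandas-based reader did not handle. Below is a minimal sketch of the section-splitting idea, using a made-up miniature CSV rather than real genai-perf output; the commit's _read_csv_sections applies the same loop to the real file:

import csv
import io

# Hypothetical miniature of a multi-section CSV; the real layout appears
# in the test fixture further down.
sample = (
    "Metric,avg,min,max\n"
    "Time To First Token (ms),11.11,22.22,33.33\n"
    "\n"
    "Metric,Value\n"
    "Output Token Throughput (tokens/sec),24\n"
    "\n"
    "Metric,GPU,avg\n"
    "GPU Power Usage (W),0,119.93\n"
)

sections, current = [], []
for row in csv.reader(io.StringIO(sample)):
    if not any(row):  # an empty row marks a section break
        if current:
            sections.append(current)
            current = []
    else:
        current.append(row)
if current:
    sections.append(current)

print(len(sections))  # 3: statistics, single-value metrics, GPU telemetry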

File tree

2 files changed: +133 -36 lines

src/cloudai/workloads/ai_dynamo/report_generation_strategy.py

Lines changed: 100 additions & 33 deletions
@@ -16,13 +16,14 @@
 
 from __future__ import annotations
 
+import csv
 import logging
 import shutil
+from pathlib import Path
 from typing import ClassVar
 
 from cloudai.core import METRIC_ERROR, ReportGenerationStrategy
 from cloudai.systems.slurm.slurm_system import SlurmSystem
-from cloudai.util.lazy_imports import lazy
 
 
 class AIDynamoReportGenerationStrategy(ReportGenerationStrategy):
@@ -44,24 +45,50 @@ def can_handle_directory(self) -> bool:
         json_files = list(output_path.rglob("profile_genai_perf.json"))
         return len(csv_files) > 0 and len(json_files) > 0
 
-    def _read_metric_from_csv(self, metric_name: str) -> float:
+    def _find_csv_file(self) -> Path | None:
         output_path = self.test_run.output_path
-        source_csv = next(output_path.rglob("profile_genai_perf.csv"))
-
-        if source_csv.stat().st_size == 0:
-            return METRIC_ERROR
+        if not output_path.exists() or not output_path.is_dir():
+            return None
 
-        df = lazy.pd.read_csv(source_csv)
-        metric_row = df[df["Metric"] == metric_name]
+        csv_files = list(output_path.rglob("profile_genai_perf.csv"))
+        if not csv_files or csv_files[0].stat().st_size == 0:
+            return None
+
+        return csv_files[0]
+
+    def _extract_metric_value(self, header: list[str], row: list[str], metric_idx: int) -> float | None:
+        if "Value" in header:
+            value_idx = header.index("Value")
+            return float(row[value_idx].replace(",", ""))
+        elif "avg" in header:
+            avg_idx = header.index("avg")
+            return float(row[avg_idx].replace(",", ""))
+        return None
+
+    def _find_metric_in_section(self, section: list[list[str]], metric_name: str) -> float | None:
+        if not section:
+            return None
+
+        header = section[0]
+        if "Metric" not in header:
+            return None
+
+        metric_idx = header.index("Metric")
+        for row in section[1:]:
+            if row[metric_idx] == metric_name:
+                return self._extract_metric_value(header, row, metric_idx)
+        return None
 
-        if metric_row.empty:
+    def _read_metric_from_csv(self, metric_name: str) -> float:
+        source_csv = self._find_csv_file()
+        if not source_csv:
             return METRIC_ERROR
 
-        if "Value" in df.columns and not metric_row["Value"].empty:
-            return float(metric_row["Value"].iloc[0])
-
-        if "avg" in df.columns and not metric_row["avg"].empty:
-            return float(metric_row["avg"].iloc[0].replace(",", ""))
+        sections = self._read_csv_sections(source_csv)
+        for section in sections:
+            value = self._find_metric_in_section(section, metric_name)
+            if value is not None:
+                return value
 
         return METRIC_ERROR
 
@@ -85,36 +112,76 @@ def get_metric(self, metric: str) -> float:
 
         return self._read_metric_from_csv(mapped_metric)
 
-    def generate_report(self) -> None:
-        output_path = self.test_run.output_path
-        source_csv = next(output_path.rglob("profile_genai_perf.csv"))
-        target_csv = output_path / "report.csv"
-
-        shutil.copy2(source_csv, target_csv)
-
+    def _calculate_total_gpus(self) -> int | None:
         gpus_per_node = None
         if isinstance(self.system, SlurmSystem):
             gpus_per_node = self.system.gpus_per_node
 
         if gpus_per_node is None:
-            logging.warning("gpus_per_node is None, skipping Overall Output Tokens per Second per GPU calculation.")
-            return
+            return None
 
         num_frontend_nodes = 1
         num_prefill_nodes = self.test_run.test.test_definition.cmd_args.dynamo.prefill_worker.num_nodes
         num_decode_nodes = self.test_run.test.test_definition.cmd_args.dynamo.decode_worker.num_nodes
 
-        total_gpus = (num_frontend_nodes + num_prefill_nodes + num_decode_nodes) * gpus_per_node
+        return (num_frontend_nodes + num_prefill_nodes + num_decode_nodes) * gpus_per_node
+
+    def _read_csv_sections(self, source_csv: Path) -> list[list[list[str]]]:
+        sections = []
+        current_section = []
 
         with open(source_csv, "r") as f:
-            lines = f.readlines()
-            output_token_throughput_line = next(
-                (line for line in lines if "Output Token Throughput (tokens/sec)" in line), None
-            )
-            if output_token_throughput_line:
-                output_token_throughput = float(output_token_throughput_line.split(",")[1].strip())
+            csv_reader = csv.reader(f)
+            for row in csv_reader:
+                if not any(row):  # Empty row indicates section break
+                    if current_section:
+                        sections.append(current_section)
+                        current_section = []
+                else:
+                    current_section.append(row)
+            if current_section:
+                sections.append(current_section)
+
+        return sections
+
+    def _write_sections_with_metric(
+        self, target_csv: Path, sections: list[list[list[str]]], total_gpus: int | None
+    ) -> None:
+        with open(target_csv, "w", newline="") as f:
+            writer = csv.writer(f)
+
+            # Write first section (statistical metrics)
+            if sections:
+                for row in sections[0]:
+                    writer.writerow(row)
+                writer.writerow([])  # Empty row for section break
+
+            # Write second section with additional metric if total_gpus is available
+            if len(sections) > 1:
+                for row in sections[1]:
+                    writer.writerow(row)
+                    if total_gpus and row and row[0] == "Output Token Throughput (tokens/sec)":
+                        throughput = float(row[1].replace(",", ""))
+                        per_gpu_throughput = throughput / total_gpus
+                        writer.writerow(["Overall Output Tokens per Second per GPU", per_gpu_throughput])
+                writer.writerow([])  # Empty row for section break
+
+            # Write remaining sections
+            for section in sections[2:]:
+                for row in section:
+                    writer.writerow(row)
+                writer.writerow([])  # Empty row for section break
 
-            overall_output_tokens_per_second_per_gpu = output_token_throughput / total_gpus
+    def generate_report(self) -> None:
+        output_path = self.test_run.output_path
+        source_csv = next(output_path.rglob("profile_genai_perf.csv"))
+        target_csv = output_path / "report.csv"
+
+        total_gpus = self._calculate_total_gpus()
+        if total_gpus is None:
+            logging.warning("gpus_per_node is None, skipping Overall Output Tokens per Second per GPU calculation.")
+            shutil.copy2(source_csv, target_csv)
+            return
 
-        with open(target_csv, "a") as f:
-            f.write(f"Overall Output Tokens per Second per GPU,{overall_output_tokens_per_second_per_gpu}\n")
+        sections = self._read_csv_sections(source_csv)
+        self._write_sections_with_metric(target_csv, sections, total_gpus)
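To make the derived metric concrete: the test further down expects the row "Overall Output Tokens per Second per GPU,1.0", which follows from the fixture's throughput of 24 tokens/sec spread across 24 total GPUs. A worked sketch, where the node counts and gpus_per_node are assumptions chosen to reproduce those numbers (the real values come from SlurmSystem.gpus_per_node and the Dynamo prefill/decode worker node counts):

# Assumed topology: 1 frontend + 1 prefill + 1 decode node, 8 GPUs per node.
num_frontend_nodes, num_prefill_nodes, num_decode_nodes = 1, 1, 1
gpus_per_node = 8  # assumption for illustration
total_gpus = (num_frontend_nodes + num_prefill_nodes + num_decode_nodes) * gpus_per_node  # 24

output_token_throughput = 24.0  # "Output Token Throughput (tokens/sec)" row
print(output_token_throughput / total_gpus)  # 1.0, the appended per-GPU row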

tests/report_generation_strategy/test_ai_dynamo_report_generation_strategy.py

Lines changed: 33 additions & 3 deletions
@@ -42,11 +42,18 @@ def get_csv_content() -> str:
         '"7777.77","8888.88","9999.99"\n'
         "Inter Token Latency (ms),12.34,23.45,34.56,45.67,56.78,67.89,78.90,89.01,90.12\n"
         "Output Sequence Length (tokens),101.01,202.02,303.03,404.04,505.05,606.06,707.07,808.08,909.09\n"
-        "Input Sequence Length (tokens),123.45,234.56,345.67,456.78,567.89,678.90,789.01,890.12,901.23\n\n"
+        "Input Sequence Length (tokens),123.45,234.56,345.67,456.78,567.89,678.90,789.01,890.12,901.23\n"
+        "\n"
         "Metric,Value\n"
         "Output Token Throughput (tokens/sec),24\n"
         "Request Throughput (per sec),1.23\n"
         "Request Count (count),40.00\n"
+        "\n"
+        "Metric,GPU,avg,min,max,p99,p95,p90,p75,p50,p25\n"
+        "GPU Power Usage (W),0,119.93,117.61,120.81,120.81,120.81,120.81,120.81,120.60,119.85\n"
+        "GPU Power Usage (W),1,120.50,120.49,120.52,120.52,120.52,120.52,120.52,120.50,120.49\n"
+        "GPU Memory Used (GB),0,84.11,82.41,84.68,84.68,84.68,84.68,84.68,84.67,84.11\n"
+        "GPU Memory Used (GB),1,82.44,82.44,82.44,82.44,82.44,82.44,82.44,82.44,82.44\n"
     )
@@ -116,8 +123,31 @@ def test_ai_dynamo_generate_report(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
     assert report_file.is_file(), "Report CSV was not generated."
 
     report_content = report_file.read_text()
-    expected_content = csv_content + "Overall Output Tokens per Second per GPU,1.0\n"
-    assert report_content == expected_content, "Report content does not match expected."
+
+    def split_into_sections(content: str) -> list[str]:
+        sections = content.split("\n\n")
+        return [s.strip() for s in sections if s.strip()]
+
+    def normalize_csv_section(section: str) -> str:
+        return section.replace('"', "").strip()
+
+    actual_sections = [normalize_csv_section(s) for s in split_into_sections(report_content)]
+    expected_sections = [normalize_csv_section(s) for s in split_into_sections(csv_content)]
+
+    # First section should match after normalization
+    assert actual_sections[0] == expected_sections[0], "First section (metrics) does not match"
+
+    # Second section should have our additional metric
+    second_section_lines = actual_sections[1].split("\n")
+    assert second_section_lines[0] == "Metric,Value", "Second section header does not match"
+    assert second_section_lines[1] == "Output Token Throughput (tokens/sec),24", "Throughput line does not match"
+    assert second_section_lines[2] == "Overall Output Tokens per Second per GPU,1.0", "Added metric line is incorrect"
+    assert second_section_lines[3:] == ["Request Throughput (per sec),1.23", "Request Count (count),40.00"], (
+        "Remaining lines do not match"
+    )
+
+    # Third section (GPU metrics) should be identical
+    assert actual_sections[2] == expected_sections[2], "Third section (GPU metrics) does not match"
 
 
 def test_ai_dynamo_get_metric_single_values(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
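Given the assertions above, the second section of the rewritten report.csv comes out as follows (quotes normalized away), with the derived row spliced in immediately after the throughput line rather than appended at the end of the file as before:

Metric,Value
Output Token Throughput (tokens/sec),24
Overall Output Tokens per Second per GPU,1.0
Request Throughput (per sec),1.23
Request Count (count),40.00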
