Skip to content

Commit 8cb9ca7

Browse files
mridul-sahu and Orbax Authors
authored and committed
No public description
PiperOrigin-RevId: 833407698
1 parent 4b5d0b4 commit 8cb9ca7

File tree

4 files changed

+167
-40
lines changed

4 files changed

+167
-40
lines changed

checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core.py

Lines changed: 14 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -381,44 +381,17 @@ def __init__(
381381
self._benchmarks_generators = benchmarks_generators
382382
self._skip_incompatible_mesh_configs = skip_incompatible_mesh_configs
383383
self._num_repeats = num_repeats
384-
385-
def _generate_report(self, results: Sequence[TestResult]) -> str:
386-
"""Generates a report from the test results."""
387-
passed_count = 0
388-
failed_tests = []
389-
for result in results:
390-
if result.is_successful():
391-
passed_count += 1
392-
else:
393-
failed_tests.append(result)
394-
395-
failed_count = len(failed_tests)
396-
report_lines = []
397-
title = f" Test Suite Report: {self._name} "
398-
report_lines.append(f"\n{title:=^80}")
399-
report_lines.append(f"Total tests run: {len(results)}")
400-
report_lines.append(f"Passed: {passed_count}")
401-
report_lines.append(f"Failed: {failed_count}")
402-
403-
if failed_count > 0:
404-
report_lines.append("-" * 80)
405-
report_lines.append("--- Failed Tests ---")
406-
for result in failed_tests:
407-
error_repr = repr(result.error)
408-
# Limit error length to avoid flooding logs.
409-
if len(error_repr) > 1000:
410-
error_repr = error_repr[:1000] + "..."
411-
report_lines.append(f"Test: {result.metrics.name}, Error: {error_repr}")
412-
report_lines.append("=" * 80)
413-
return "\n".join(report_lines)
384+
self._suite_metrics = metric_lib.MetricsManager(
385+
name=name, num_repeats=num_repeats
386+
)
414387

415388
def run(self) -> Sequence[TestResult]:
416389
"""Runs all benchmarks in the suite sequentially."""
417390
logging.info(
418391
"\n%s Running Test Suite: %s %s", "=" * 25, self._name, "=" * 25
419392
)
420393

421-
results = []
394+
all_results = []
422395
for i, generator in enumerate(self._benchmarks_generators):
423396
logging.info(
424397
"\n%s Running Generator %d: %s %s",
@@ -432,7 +405,8 @@ def run(self) -> Sequence[TestResult]:
432405
)
433406
if not generated_benchmarks:
434407
logging.warning(
435-
"Generator %s produced no benchmarks.", generator.__class__.__name__
408+
"Generator %s produced no benchmarks.",
409+
generator.__class__.__name__,
436410
)
437411
continue
438412

@@ -445,11 +419,15 @@ def run(self) -> Sequence[TestResult]:
445419
i + 1,
446420
self._num_repeats,
447421
)
448-
results.append(benchmark.run(repeat_index=repeat_index))
422+
result = benchmark.run(repeat_index=repeat_index)
423+
all_results.append(result)
424+
self._suite_metrics.add_result(
425+
benchmark.name, result.metrics, result.error
426+
)
449427

450-
if not results:
428+
if not all_results:
451429
logging.warning("No benchmarks were run for this suite.")
452430

453-
logging.info(self._generate_report(results))
431+
logging.info(self._suite_metrics.generate_report())
454432
multihost.sync_global_processes("test_suite:run_end")
455-
return results
433+
return all_results

checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core_test.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -452,10 +452,9 @@ def test_fn(self, test_context: core.TestContext) -> core.TestResult:
452452
report_log = report_log_call[0][0]
453453

454454
self.assertIn(' Test Suite Report: report_suite ', report_log)
455-
self.assertIn('Total tests run: 3', report_log)
456-
self.assertIn('Passed: 2', report_log)
457-
self.assertIn('Failed: 1', report_log)
458-
self.assertIn('--- Failed Tests ---', report_log)
455+
self.assertIn('Total benchmark configurations: 3', report_log)
456+
self.assertIn('Total runs (1 repeats): 3, Passed: 2, Failed: 1', report_log)
457+
self.assertIn('--- Failed Runs ---', report_log)
459458
self.assertIn("Error: ValueError('opt1=2, opt2=b failed')", report_log)
460459

461460

checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/metric.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
"""Metric classes for benchmarking."""
1616

17+
import collections
1718
from collections.abc import MutableMapping
1819
import contextlib
1920
import dataclasses
@@ -25,6 +26,7 @@
2526
from typing import Any
2627

2728
from absl import logging
29+
import numpy as np
2830
from orbax.checkpoint._src.multihost import multihost
2931
import psutil
3032
import tensorstore as ts
@@ -435,6 +437,89 @@ def report(self):
435437
logging.info("\n".join(report_lines))
436438

437439

440+
class MetricsManager:
441+
"""Manages metrics aggregation across multiple benchmark runs."""
442+
443+
def __init__(self, name: str, num_repeats: int):
444+
self._name = name
445+
self._num_repeats = num_repeats
446+
self._runs: dict[str, list[tuple[Metrics, Exception | None]]] = (
447+
collections.defaultdict(list)
448+
)
449+
450+
def add_result(
451+
self, benchmark_name: str, metrics: Metrics, error: Exception | None
452+
):
453+
"""Adds a result from a single benchmark run."""
454+
self._runs[benchmark_name].append((metrics, error))
455+
456+
def generate_report(self) -> str:
457+
"""Generates a report with statistics from the test results."""
458+
report_lines = []
459+
title = f" Test Suite Report: {self._name} "
460+
report_lines.append(f"\n{title:=^80}")
461+
462+
total_runs = 0
463+
passed_runs = 0
464+
failed_runs = 0
465+
for _, results in self._runs.items():
466+
total_runs += len(results)
467+
for _, error in results:
468+
if error is None:
469+
passed_runs += 1
470+
else:
471+
failed_runs += 1
472+
473+
report_lines.append(f"Total benchmark configurations: {len(self._runs)}")
474+
report_lines.append(
475+
f"Total runs ({self._num_repeats} repeats): {total_runs}, Passed:"
476+
f" {passed_runs}, Failed: {failed_runs}"
477+
)
478+
479+
if self._num_repeats > 1:
480+
report_lines.append("\n" + "-" * 80)
481+
report_lines.append("--- Aggregated Metrics per Benchmark ---")
482+
for benchmark_name, results in self._runs.items():
483+
if not results:
484+
continue
485+
report_lines.append(f"\nBenchmark: {benchmark_name}")
486+
metrics_collector = collections.defaultdict(list)
487+
metric_units = {}
488+
for metrics, error in results:
489+
if error is None:
490+
for key, (value, unit) in metrics.results.items():
491+
if isinstance(value, (int, float)):
492+
metrics_collector[key].append(value)
493+
metric_units[key] = unit
494+
if not metrics_collector:
495+
report_lines.append(" No successful runs to aggregate.")
496+
continue
497+
for key, values in metrics_collector.items():
498+
unit = metric_units[key]
499+
mean = np.mean(values)
500+
stdev = np.std(values)
501+
min_val = np.min(values)
502+
max_val = np.max(values)
503+
report_lines.append(
504+
f" {key}: {mean:.4f} +/- {stdev:.4f} {unit} (min:"
505+
f" {min_val:.4f}, max: {max_val:.4f}, n={len(values)})"
506+
)
507+
508+
if failed_runs > 0:
509+
report_lines.append("\n" + "-" * 80)
510+
report_lines.append("--- Failed Runs ---")
511+
for _, results in self._runs.items():
512+
for metrics, error in results:
513+
if error is not None:
514+
error_repr = repr(error)
515+
# Limit error length to avoid flooding logs.
516+
if len(error_repr) > 1000:
517+
error_repr = error_repr[:1000] + "..."
518+
report_lines.append(f"Test: {metrics.name}, Error: {error_repr}")
519+
report_lines.append("\n" + "=" * 80)
520+
return "\n".join(report_lines)
521+
522+
438523
class _MetricsCollector:
439524
"""Internal context manager to collect specified metrics."""
440525

checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/metric_test.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,5 +151,70 @@ def test_all_metrics(self):
151151
self.assertIn('test_metric_tensorstore_diff_count', metrics.results)
152152

153153

154+
class MetricsManagerTest(parameterized.TestCase):
  """Tests for metric_lib.MetricsManager report generation."""

  def test_add_result_and_generate_report_no_repeats(self):
    """With num_repeats=1 the report has counts and failures, no aggregation."""
    manager = metric_lib.MetricsManager(name='Suite', num_repeats=1)
    metrics1 = metric_lib.Metrics()
    metrics1.results['op1_time_duration'] = (1.0, 's')
    manager.add_result('bench1', metrics1, None)

    metrics2 = metric_lib.Metrics()
    metrics2.results['op1_time_duration'] = (2.0, 's')
    # Second benchmark fails; it should appear in the Failed Runs section.
    manager.add_result('bench2', metrics2, ValueError('failure'))

    report = manager.generate_report()
    self.assertIn('Suite', report)
    self.assertIn('Total benchmark configurations: 2', report)
    self.assertIn('Total runs (1 repeats): 2, Passed: 1, Failed: 1', report)
    # Aggregation section is only emitted when num_repeats > 1.
    self.assertNotIn('Aggregated Metrics', report)
    self.assertIn('Failed Runs', report)
    self.assertIn("Error: ValueError('failure')", report)

  def test_generate_report_with_repeats_and_aggregation(self):
    """With repeats, successful runs are aggregated per metric key."""
    manager = metric_lib.MetricsManager(name='Suite', num_repeats=3)

    # Benchmark 1, Run 1
    m1r1 = metric_lib.Metrics()
    m1r1.results['op_time_duration'] = (1.0, 's')
    m1r1.results['op_rss_diff'] = (10.0, 'MB')
    manager.add_result('bench1', m1r1, None)
    # Benchmark 1, Run 2
    m1r2 = metric_lib.Metrics()
    m1r2.results['op_time_duration'] = (1.2, 's')
    m1r2.results['op_rss_diff'] = (12.0, 'MB')
    manager.add_result('bench1', m1r2, None)
    # Benchmark 1, Run 3 (Failed) — excluded from aggregation, listed in
    # the Failed Runs section.
    m1r3 = metric_lib.Metrics()
    manager.add_result('bench1', m1r3, RuntimeError('Run 3 failed'))

    report = manager.generate_report()

    self.assertIn('Suite', report)
    self.assertIn('Total benchmark configurations: 1', report)
    self.assertIn('Total runs (3 repeats): 3, Passed: 2, Failed: 1', report)
    self.assertIn('Aggregated Metrics', report)
    self.assertIn('Benchmark: bench1', report)
    # mean=1.1, std=0.1, min=1.0, max=1.2
    self.assertIn(
        'op_time_duration: 1.1000 +/- 0.1000 s (min: 1.0000, max: 1.2000, n=2)',
        report,
    )
    # mean=11.0, std=1.0, min=10.0, max=12.0
    self.assertIn(
        'op_rss_diff: 11.0000 +/- 1.0000 MB (min: 10.0000, max: 12.0000, n=2)',
        report,
    )
    self.assertIn('Failed Runs', report)
    self.assertIn("Error: RuntimeError('Run 3 failed')", report)

  def test_generate_report_no_successful_runs_for_aggregation(self):
    """All-failed benchmarks get the 'No successful runs' placeholder."""
    manager = metric_lib.MetricsManager(name='Suite', num_repeats=2)
    manager.add_result('bench1', metric_lib.Metrics(), ValueError('1'))
    manager.add_result('bench1', metric_lib.Metrics(), ValueError('2'))
    report = manager.generate_report()
    self.assertIn('No successful runs to aggregate', report)
217+
218+
154219
if __name__ == '__main__':
155220
absltest.main()

0 commit comments

Comments
 (0)