2626from typing import Any
2727
2828from absl import logging
29+ from clu import metric_writers
30+ from etils import epath
2931import numpy as np
3032from orbax .checkpoint ._src .multihost import multihost
3133import psutil
@@ -437,24 +439,146 @@ def report(self):
437439 logging .info ("\n " .join (report_lines ))
438440
439441
class _MetricsCollector:
  """Internal context manager that starts and stops a set of named metrics.

  On entry every successfully registered metric is started; on exit each one
  is stopped and its results are folded into the owning Metrics object.
  A failure while stopping one metric is logged and does not prevent the
  remaining metrics from being collected.
  """

  def __init__(
      self, metrics_obj: Metrics, operation_name: str, metric_keys: list[str]
  ):
    self.metrics_obj = metrics_obj
    self.operation_name = operation_name
    self._metrics: dict[str, BaseMetric] = {}

    for key in metric_keys:
      metric_class = METRIC_REGISTRY.get(key)
      if metric_class is None:
        # Unknown keys are skipped rather than treated as fatal.
        logging.warning("Unknown metric key: %s", key)
      else:
        self._metrics[key] = metric_class(operation_name)

  def __enter__(self):
    for metric in self._metrics.values():
      metric.start()
    return self

  def __exit__(self, *exc):
    for key, metric in self._metrics.items():
      try:
        self.metrics_obj._add_results(metric.name, key, metric.stop())
      except Exception as err:  # pylint: disable=broad-exception-caught
        logging.exception("Error stopping metric %s: %s", metric.name, err)
472+
473+ ################################################################################
474+ # Aggregation and Reporting
475+ ################################################################################
476+
477+
@dataclasses.dataclass
class AggregatedStats:
  """Summary statistics computed across multiple benchmark repetitions.

  Attributes:
    mean: Arithmetic mean of the collected values.
    std: Standard deviation of the collected values.
    min: Smallest collected value.
    max: Largest collected value.
    count: How many values were aggregated.
  """

  mean: float
  std: float
  min: float
  max: float
  count: int
496+
440497class MetricsManager :
441- """Manages metrics aggregation across multiple benchmark runs."""
498+ """Manages metrics aggregation and reporting for a test suite.
499+
500+ This class collects metrics from multiple benchmark runs and repetitions,
501+ computes aggregate statistics (mean, std, min, max), generates a
502+ human-readable report for logging, and exports metrics to TensorBoard
503+ if configured.
504+ """
505+
506+ def __init__ (
507+ self ,
508+ name : str ,
509+ num_repeats : int ,
510+ ):
511+ """Initializes the MetricsManager.
442512
443- def __init__ (self , name : str , num_repeats : int ):
513+ Args:
514+ name: The name of the test suite.
515+ num_repeats: The number of repetitions for each benchmark configuration.
516+ """
444517 self ._name = name
445518 self ._num_repeats = num_repeats
446519 self ._runs : dict [str , list [tuple [Metrics , Exception | None ]]] = (
447520 collections .defaultdict (list )
448521 )
522+ self ._benchmark_options : dict [str , Any ] = {}
449523
450524 def add_result (
451- self , benchmark_name : str , metrics : Metrics , error : Exception | None
525+ self ,
526+ benchmark_name : str ,
527+ options : Any ,
528+ metrics : Metrics ,
529+ error : Exception | None ,
452530 ):
453- """Adds a result from a single benchmark run."""
531+ """Adds metrics from a single benchmark run/repetition.
532+
533+ Args:
534+ benchmark_name: The name of the benchmark configuration.
535+ options: The BenchmarkOptions used for this run.
536+ metrics: The Metrics object containing results for this run.
537+ error: An exception if the run failed, otherwise None.
538+ """
454539 self ._runs [benchmark_name ].append ((metrics , error ))
540+ if benchmark_name not in self ._benchmark_options :
541+ self ._benchmark_options [benchmark_name ] = options
542+
543+ def _aggregate_metrics (
544+ self , results : list [tuple [Metrics , Exception | None ]]
545+ ) -> tuple [dict [str , AggregatedStats ], dict [str , str ]]:
546+ """Computes aggregate stats (mean, std, etc.) for successful runs.
547+
548+ Args:
549+ results: A list of (Metrics, error) tuples for a benchmark configuration.
550+
551+ Returns:
552+ A tuple containing:
553+ - A dict mapping metric keys to AggregatedStats.
554+ - A dict mapping metric keys to their units.
555+ """
556+ metrics_collector = collections .defaultdict (list )
557+ metric_units = {}
558+ for metrics , error in results :
559+ if error is None :
560+ for key , (value , unit ) in metrics .results .items ():
561+ if isinstance (value , (int , float )):
562+ metrics_collector [key ].append (value )
563+ metric_units [key ] = unit
564+
565+ aggregated_stats_dict = {}
566+ for key , values in metrics_collector .items ():
567+ aggregated_stats_dict [key ] = AggregatedStats (
568+ mean = np .mean (values ),
569+ std = np .std (values ),
570+ min = np .min (values ),
571+ max = np .max (values ),
572+ count = len (values ),
573+ )
574+ return aggregated_stats_dict , metric_units
455575
456576 def generate_report (self ) -> str :
457- """Generates a report with statistics from the test results."""
577+ """Generates a final string report containing aggregated metrics.
578+
579+ Returns:
580+ A formatted string containing the full benchmark report.
581+ """
458582 report_lines = []
459583 title = f" Test Suite Report: { self ._name } "
460584 report_lines .append (f"\n { title := ^ 80 } " )
@@ -476,35 +600,29 @@ def generate_report(self) -> str:
476600 f" { passed_runs } , Failed: { failed_runs } "
477601 )
478602
603+ # Aggregate metrics, add to report, and write aggregates to TensorBoard
479604 if self ._num_repeats > 1 :
480605 report_lines .append ("\n " + "-" * 80 )
481606 report_lines .append ("--- Aggregated Metrics per Benchmark ---" )
482607 for benchmark_name , results in self ._runs .items ():
483608 if not results :
484609 continue
485610 report_lines .append (f"\n Benchmark: { benchmark_name } " )
486- metrics_collector = collections .defaultdict (list )
487- metric_units = {}
488- for metrics , error in results :
489- if error is None :
490- for key , (value , unit ) in metrics .results .items ():
491- if isinstance (value , (int , float )):
492- metrics_collector [key ].append (value )
493- metric_units [key ] = unit
494- if not metrics_collector :
611+
612+ aggregated_stats_dict , metric_units = self ._aggregate_metrics (results )
613+
614+ if not aggregated_stats_dict :
495615 report_lines .append (" No successful runs to aggregate." )
496616 continue
497- for key , values in metrics_collector .items ():
617+
618+ for key , stats in aggregated_stats_dict .items ():
498619 unit = metric_units [key ]
499- mean = np .mean (values )
500- stdev = np .std (values )
501- min_val = np .min (values )
502- max_val = np .max (values )
503620 report_lines .append (
504- f" { key } : { mean :.4f} +/- { stdev :.4f} { unit } (min:"
505- f" { min_val :.4f} , max: { max_val :.4f} , n={ len ( values ) } )"
621+ f" { key } : { stats . mean :.4f} +/- { stats . std :.4f} { unit } (min:"
622+ f" { stats . min :.4f} , max: { stats . max :.4f} , n={ stats . count } )"
506623 )
507624
625+ # Report failed runs
508626 if failed_runs > 0 :
509627 report_lines .append ("\n " + "-" * 80 )
510628 report_lines .append ("--- Failed Runs ---" )
@@ -516,36 +634,42 @@ def generate_report(self) -> str:
516634 if len (error_repr ) > 1000 :
517635 error_repr = error_repr [:1000 ] + "..."
518636 report_lines .append (f"Test: { metrics .name } , Error: { error_repr } " )
637+
519638 report_lines .append ("\n " + "=" * 80 )
520639 return "\n " .join (report_lines )
521640
522-
523- class _MetricsCollector :
524- """Internal context manager to collect specified metrics."""
525-
526- def __init__ (
527- self , metrics_obj : Metrics , operation_name : str , metric_keys : list [str ]
528- ):
529- self .metrics_obj = metrics_obj
530- self .operation_name = operation_name
531- self ._metrics : dict [str , BaseMetric ] = {}
532-
533- for key in metric_keys :
534- if key in METRIC_REGISTRY :
535- metric_class = METRIC_REGISTRY [key ]
536- self ._metrics [key ] = metric_class (operation_name )
537- else :
538- logging .warning ("Unknown metric key: %s" , key )
539-
540- def __enter__ (self ):
541- for metric in self ._metrics .values ():
542- metric .start ()
543- return self
544-
545- def __exit__ (self , * exc ):
546- for key , metric in self ._metrics .items ():
547- try :
548- metric_results = metric .stop ()
549- self .metrics_obj ._add_results (metric .name , key , metric_results )
550- except Exception as e : # pylint: disable=broad-exception-caught
551- logging .exception ("Error stopping metric %s: %s" , metric .name , e )
641+ def export_to_tensorboard (self , tensorboard_dir : epath .Path ):
642+ """Exports metrics to TensorBoard."""
643+ logging .info ("Writing per-repetition metrics to TensorBoard..." )
644+ for benchmark_name , results in self ._runs .items ():
645+ is_primary_host = multihost .process_index () == 0
646+ writer = metric_writers .create_default_writer (
647+ tensorboard_dir ,
648+ just_logging = not is_primary_host ,
649+ collection = benchmark_name ,
650+ )
651+ # Write metrics for each repetition
652+ for i , (metrics , error ) in enumerate (results ):
653+ if error is None :
654+ for key , (value , unit ) in metrics .results .items ():
655+ tag = f'{ key } _{ unit .replace ("/" , "_" )} '
656+ if isinstance (value , (int , float )):
657+ writer .write_scalars (step = i , scalars = {tag : value })
658+ else :
659+ writer .write_texts (step = i , texts = {tag : str (value )})
660+ else :
661+ tag = "error"
662+ writer .write_texts (step = i , texts = {tag : f"<pre>{ repr (error )} </pre>" })
663+ # Write benchmark options as text
664+ if self ._benchmark_options [benchmark_name ]:
665+ writer .write_texts (
666+ step = 0 ,
667+ texts = {
668+ "options" : (
669+ f"<pre>{ repr (self ._benchmark_options [benchmark_name ])} </pre>"
670+ )
671+ },
672+ )
673+ writer .flush ()
674+ writer .close ()
675+ logging .info ("Finished writing metrics to TensorBoard." )
0 commit comments