Add a resharding benchmark. This benchmark only loads the checkpoint repeatedly. The source checkpoint is expected to have been generated on a different topology and/or with a different sharding. The benchmark relies on a sharding config file to dictate the new shardings for the loaded checkpoint.

cpgaffney1 · Orbax Authors · commit c421a44f362a · 2026-02-23T23:29:24.000-08:00
PiperOrigin-RevId: 874406266
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/checkpoint_generation.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/checkpoint_generation.py
@@ -127,34 +127,28 @@ def _partition_axis_name(offset: int) -> str:
 
 
 
-def _get_abstract_state(
-    config: configs.CheckpointConfig,
+def get_abstract_state_with_generated_shardings(pytree_metadata: Any) -> Any:
+  abstract_state = jax.tree.map(
+      abstract_arrays.to_shape_dtype_struct, pytree_metadata
+  )
+  shardings = sharding_utils.construct_maximal_shardings(abstract_state)
+  return jax.tree.map(
+      lambda sds, sharding: jax.ShapeDtypeStruct(
+          sds.shape, sds.dtype, sharding=sharding
+      ),
+      abstract_state,
+      shardings,
+  )
+
+
+def get_abstract_state_from_sharding_config(
+    sharding_config_path: epath.Path,
+    metadata: Any,
     *,
-    use_ocdbt: bool,
-    devices: list[jax.Device] | None = None,
+    devices: list[jax.Device],
 ) -> Any:
-  """Loads sharding configuration from a JSON file."""
-  path = epath.Path(config.path)
-  devices = devices or jax.devices()
-  with checkpointer.Checkpointer(
-      pytree_checkpoint_handler.PyTreeCheckpointHandler(use_ocdbt=use_ocdbt)
-  ) as ckptr:
-    metadata = ckptr.metadata(path).item_metadata
-
-  if config.sharding_config_path is None:
-    abstract_state = jax.tree.map(
-        abstract_arrays.to_shape_dtype_struct, metadata.tree
-    )
-    shardings = sharding_utils.construct_maximal_shardings(abstract_state)
-    return jax.tree.map(
-        lambda sds, sharding: jax.ShapeDtypeStruct(
-            sds.shape, sds.dtype, sharding=sharding
-        ),
-        abstract_state,
-        shardings,
-    )
-
-  path = epath.Path(config.sharding_config_path)
+  """Loads abstract state from a JSON file."""
+  path = epath.Path(sharding_config_path)
   parsed_config = json.loads(path.read_text())
   flat_abstract_state = {}
   for k, v in parsed_config.items():
@@ -169,9 +163,28 @@ def _get_abstract_state(
             spec=jax.sharding.PartitionSpec(*v['sharding']['spec']),
         ),
     )
-    return tree_utils.from_flat_dict(
-        flat_abstract_state, metadata.tree, sep='.'
-    )
+  return tree_utils.from_flat_dict(flat_abstract_state, metadata, sep='.')
+
+
+def _get_abstract_state(
+    config: configs.CheckpointConfig,
+    *,
+    use_ocdbt: bool,
+    devices: list[jax.Device] | None = None,
+) -> Any:
+  """Creates abstract state for a provided CheckpointConfig."""
+  path = epath.Path(config.path)
+  devices = devices or jax.devices()
+  with checkpointer.Checkpointer(
+      pytree_checkpoint_handler.PyTreeCheckpointHandler(use_ocdbt=use_ocdbt)
+  ) as ckptr:
+    metadata = ckptr.metadata(path).item_metadata
+
+  if config.sharding_config_path is None:
+    return get_abstract_state_with_generated_shardings(metadata.tree)
+  return get_abstract_state_from_sharding_config(
+      epath.Path(config.sharding_config_path), metadata, devices=devices
+  )
 
 
 def load_checkpoint(config: configs.CheckpointConfig) -> Any:
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/config_parsing.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/config_parsing.py
@@ -84,12 +84,6 @@ def _validate_config(config: Dict[str, Any]) -> None:
     if key not in config:
       raise ValueError(f'Missing required key in YAML config: {key}')
 
-  if 'checkpoint_config' not in config and 'checkpoint_configs' not in config:
-    raise ValueError(
-        'Missing required key in YAML config: checkpoint_config or'
-        ' checkpoint_configs'
-    )
-
   if not isinstance(config['benchmarks'], list):
     raise ValueError("'benchmarks' must be a list.")
 
@@ -137,10 +131,12 @@ def create_test_suite_from_config(
     checkpoint_configs = [
         config_lib.CheckpointConfig(**cc) for cc in config['checkpoint_configs']
     ]
-  else:
+  elif 'checkpoint_config' in config:
     checkpoint_configs = [
         config_lib.CheckpointConfig(**config['checkpoint_config'])
     ]
+  else:
+    checkpoint_configs = [config_lib.CheckpointConfig()]
 
   if 'mesh_configs' in config:
     mesh_configs = [
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/config_parsing_test.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/config_parsing_test.py
@@ -109,16 +109,6 @@ def test_missing_required_keys(self, key_to_remove):
     ):
       config_parsing._validate_config(config)
 
-  def test_missing_checkpoint_config_and_configs(self):
-    config = self._get_valid_config()
-    del config['checkpoint_config']
-    with self.assertRaisesRegex(
-        ValueError,
-        'Missing required key in YAML config: checkpoint_config or'
-        ' checkpoint_configs',
-    ):
-      config_parsing._validate_config(config)
-
   def test_benchmarks_not_list(self):
     config = self._get_valid_config()
     config['benchmarks'] = {}
@@ -366,6 +356,29 @@ def test_valid_creation_with_checkpoint_configs(self, mock_import, mock_load):
         ],
     )
 
+  @mock.patch.object(config_parsing, '_load_yaml_config', autospec=True)
+  @mock.patch.object(config_parsing, '_import_class', autospec=True)
+  def test_valid_creation_no_checkpoint_config(self, mock_import, mock_load):
+    yaml_content = """
+suite_name: No Checkpoint Config
+benchmarks:
+  -
+    generator: MockGenerator
+    options:
+      param1: 10
+"""
+    mock_load.return_value = yaml.safe_load(yaml_content)
+    mock_import.return_value = MockGenerator
+
+    test_suite = config_parsing.create_test_suite_from_config('fake.yaml')
+
+    self.assertLen(test_suite._benchmarks_generators, 1)
+    # Defaults to a single empty CheckpointConfig.
+    self.assertEqual(
+        test_suite._benchmarks_generators[0]._checkpoint_configs,
+        [config_lib.CheckpointConfig()],
+    )
+
   @mock.patch.object(config_parsing, '_load_yaml_config', autospec=True)
   @mock.patch.object(config_parsing, '_import_class', autospec=True)
   def test_generator_import_fail(self, mock_import, mock_load):
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/configs.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/configs.py
@@ -89,8 +89,6 @@ class CheckpointConfig:
   sharding_config_path: str | None = None
 
   def __post_init__(self):
-    if self.path is None and self.spec is None:
-      raise ValueError('Either path or spec must be provided.')
     if self.path is not None and self.spec is not None:
       raise ValueError('Only one of path or spec can be provided.')
     if self.sharding_config_path is not None and self.path is None:
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core.py
@@ -73,7 +73,7 @@ class TestContext:
   """Input object passed to each test function, providing pre-configured components for the test run.
 
   Attributes:
-    pytree: The generated or loaded checkpoint data.
+    pytree: The generated or loaded checkpoint data. May be None.
     path: The test directory path.
     options: The specific BenchmarkOptions for this test variant.
     mesh: The mesh used for sharding the checkpoint data.
@@ -82,7 +82,7 @@ class TestContext:
     local_path: The local path to store the checkpoint data.
   """
 
-  pytree: Any
+  pytree: Any | None
   path: epath.Path
   options: BenchmarkOptions  # The specific options for this test variant.
   mesh: jax.sharding.Mesh | None = None
@@ -165,20 +165,22 @@ def run(self, repeat_index: int | None = None) -> TestResult:
     ):
       multihost.sync_global_processes("benchmark:setup_test_directory")
 
-    if self.checkpoint_config.path is None:
-      data = checkpoint_generation.generate_checkpoint(
+    if self.checkpoint_config.path is not None:
+      pytree = checkpoint_generation.load_checkpoint(self.checkpoint_config)
+    elif self.checkpoint_config.spec is not None:
+      pytree = checkpoint_generation.generate_checkpoint(
           self.checkpoint_config, mesh=self.mesh
       )
     else:
-      data = checkpoint_generation.load_checkpoint(self.checkpoint_config)
+      pytree = None
 
     with benchmark_metrics.measure(
         "sync_global_processes:benchmark:setup_pytree"
     ):
       multihost.sync_global_processes("benchmark:setup_pytree")
 
     context = TestContext(
-        pytree=data,
+        pytree=pytree,
         path=path,
         options=self.options,
         mesh=self.mesh,
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core_test.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/core/core_test.py
@@ -169,6 +169,45 @@ def test_fn(context):
     mock_create_mesh.assert_called_once_with(mesh_config)
     self.assertEqual(mock_metrics_report.call_count, 2)
 
+  @mock.patch.object(directory_setup, 'setup_test_directory')
+  @mock.patch.object(checkpoint_generation, 'generate_checkpoint')
+  @mock.patch.object(checkpoint_generation, 'load_checkpoint')
+  @mock.patch.object(metric_lib.Metrics, 'report')
+  def test_run_with_empty_checkpoint_config(
+      self,
+      mock_metrics_report,
+      mock_load_checkpoint,
+      mock_generate_checkpoint,
+      mock_setup_test_directory,
+  ):
+    path = epath.Path(self.create_tempdir().full_path)
+    mock_setup_test_directory.return_value = path
+    options = MyBenchmarkOptions()
+
+    def test_fn(context):
+      self.assertIsNone(context.pytree)
+      self.assertEqual(context.path, path)
+      self.assertEqual(context.options, options)
+      self.assertIsNone(context.mesh)
+      return core.TestResult(metrics=metric_lib.Metrics())
+
+    ckpt_config = configs.CheckpointConfig()
+    benchmark = core.Benchmark(
+        test_fn=test_fn,
+        checkpoint_config=ckpt_config,
+        options=options,
+        name='test_benchmark',
+    )
+
+    benchmark.run()
+
+    mock_setup_test_directory.assert_called_once_with(
+        'test_benchmark', None, None
+    )
+    mock_generate_checkpoint.assert_not_called()
+    mock_load_checkpoint.assert_not_called()
+    self.assertEqual(mock_metrics_report.call_count, 2)
+
   @mock.patch.object(directory_setup, 'setup_test_directory')
   @mock.patch.object(checkpoint_generation, 'generate_checkpoint')
   @mock.patch.object(device_mesh, 'create_mesh')
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/v1_benchmark.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/v1_benchmark.py
@@ -28,7 +28,7 @@
 from orbax.checkpoint._src.testing.benchmarks.core import metric as metric_lib
 
 
-def _metrics_to_measure(options: V1BenchmarkOptions) -> list[str]:
+def get_metrics_to_measure(options: V1BenchmarkOptions) -> list[str]:
   """Returns the list of metrics to measure."""
   metrics = ["time", "rss", "io"]
   if options.metric_tracemalloc_enabled:
@@ -73,9 +73,10 @@ class V1BenchmarkOptions(benchmarks_core.BenchmarkOptions):
   metric_tensorstore_enabled: bool = False
   use_replica_parallel: bool | Sequence[bool] = False
   enable_replica_parallel_separate_folder: bool | Sequence[bool] = False
+  chunk_byte_size: int | None | Sequence[int | None] = None
   enable_trace: bool = False
 
-  def is_valid(self):
+  def is_valid(self) -> bool:
     assert isinstance(self.use_replica_parallel, bool)
     assert isinstance(self.enable_replica_parallel_separate_folder, bool)
     if self.enable_replica_parallel_separate_folder and (
@@ -89,6 +90,9 @@ def context(self) -> ocp.Context:
     return ocp.Context(
         array_options=ocp.options.ArrayOptions(
             saving=ocp.options.ArrayOptions.Saving(
+                storage_options=ocp.options.ArrayOptions.Saving.StorageOptions(
+                    chunk_byte_size=self.chunk_byte_size,
+                ),
                 use_ocdbt=self.use_ocdbt,
                 use_zarr3=self.use_zarr3,
                 use_replica_parallel=self.use_replica_parallel,
@@ -107,6 +111,13 @@ def context(self) -> ocp.Context:
     )
 
 
+def clear_pytree(pytree: Any) -> Any:
+  """Clears the pytree to free up memory."""
+  return jax.tree.map(
+      lambda x: x.delete() if isinstance(x, jax.Array) else None, pytree
+  )
+
+
 # ==============================================================================
 # 2. Implement the Benchmark Generator
 # ==============================================================================
@@ -118,12 +129,6 @@ class V1Benchmark(benchmarks_core.BenchmarksGenerator):
   V1BenchmarkHandler with various configurations.
   """
 
-  def _clear_pytree(self, pytree: Any) -> Any:
-    """Clears the pytree to free up memory."""
-    return jax.tree.map(
-        lambda x: x.delete() if isinstance(x, jax.Array) else None, pytree
-    )
-
   def test_fn(
       self, context: benchmarks_core.TestContext
   ) -> benchmarks_core.TestResult:
@@ -147,7 +152,7 @@ def test_fn(
     assert isinstance(options, V1BenchmarkOptions)
 
     logging.info("Benchmark options: %s", pprint.pformat(options))
-    metrics_to_measure = _metrics_to_measure(options)
+    metrics_to_measure = get_metrics_to_measure(options)
 
     with ocp.Context(context=options.context):
       if options.enable_trace:
@@ -162,15 +167,15 @@ def test_fn(
           ocp.save_pytree(save_path, pytree)
         with metrics.measure("save_background", metrics_to_measure):
           pass
-      context.pytree = self._clear_pytree(context.pytree)
+      context.pytree = clear_pytree(context.pytree)
       if options.enable_trace:
         jax.profiler.stop_trace()
 
       if options.enable_trace:
         jax.profiler.start_trace(context.path / "trace_load")
       with metrics.measure("load", metrics_to_measure):
         restored_pytree = ocp.load_pytree(save_path, abstract_pytree)
-      self._clear_pytree(restored_pytree)
+      clear_pytree(restored_pytree)
       if options.enable_trace:
         jax.profiler.stop_trace()
 
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/v1_resharding_benchmark.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/v1_resharding_benchmark.py
diff --git a/checkpoint/orbax/checkpoint/_src/testing/benchmarks/v1_resharding_benchmark_test.py b/checkpoint/orbax/checkpoint/_src/testing/benchmarks/v1_resharding_benchmark_test.py