Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
4b30b9f
[WIP] Optimize split manifest writes.
dcherian May 13, 2025
3a19559
Refactor to track splitting on Session
dcherian Jun 6, 2025
09cd445
Revert to aggregate_extents
dcherian Jun 6, 2025
8924a39
Tests pass!
dcherian Jun 6, 2025
ca2d171
sqw iterator
dcherian Jun 6, 2025
5e843be
cleanup
dcherian Jun 6, 2025
03d62ac
lint
dcherian Jun 6, 2025
ef37e58
Fix bug
dcherian Jun 7, 2025
9fdd0d3
Minor changes
dcherian Jun 9, 2025
c93ea84
Fix distributed writes.
dcherian Jun 9, 2025
46638de
Handle appends
dcherian Jun 10, 2025
ac1d24b
Refactor overlaps to overlap_with
dcherian Jun 10, 2025
dae0d60
Refactor overlaps to overlap_with
dcherian Jun 10, 2025
f841e3c
Cleanup
dcherian Jun 10, 2025
839fdc3
cleanup more
dcherian Jun 10, 2025
35934a9
more tests
dcherian Jun 10, 2025
4b3cde3
more cleanup
dcherian Jun 11, 2025
b9d3ec8
note
dcherian Jun 11, 2025
75d264a
Track _all_ deleted chunks separately
dcherian Jun 11, 2025
f88c0f5
Revert "Track _all_ deleted chunks separately"
dcherian Jun 11, 2025
d6d1a90
Track deleted chunks _outside_ array shape only
dcherian Jun 11, 2025
9b9712e
Update stateful tests
dcherian Jun 11, 2025
eaee66a
Fix bug & update tests.
dcherian Jun 11, 2025
5f4e93a
Stricter GC test
dcherian Jun 12, 2025
a15a8ac
Fix stateful test
dcherian Jun 12, 2025
0fc8c2a
Cleanup
dcherian Jun 12, 2025
2a6d65d
More complex test + fixes
dcherian Jun 13, 2025
27ad4bc
more test
dcherian Jun 13, 2025
14d70df
Fix merging
dcherian Jun 13, 2025
27c7c4b
reorg tests
dcherian Jun 13, 2025
9e1c8ad
edits
dcherian Jun 13, 2025
e974f48
Add conflicting commits test
dcherian Jun 16, 2025
76e58fd
Add test for splits changing in a session
dcherian Jun 16, 2025
a306abf
lint
dcherian Jun 16, 2025
e719c7f
Review comments
dcherian Jun 17, 2025
b7fb385
Use ManifestExtents::ALL sentinel
dcherian Jun 17, 2025
4a2e455
Update stateful tests
dcherian Jun 18, 2025
b179dbc
update benchmarks dep group
dcherian Jun 20, 2025
e184e29
small edits
dcherian Jun 20, 2025
3b52259
Parallel writes
dcherian Jun 21, 2025
4e68231
more benchmarks
dcherian Jun 21, 2025
d0c42be
Revert "Parallel writes"
dcherian Jun 23, 2025
fcdc2d6
lint
dcherian Jun 23, 2025
ed648a8
Merge branch 'main' into manifest-split-write
dcherian Jun 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 53 additions & 1 deletion icechunk-python/benchmarks/test_benchmark_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,9 @@ def write():


@pytest.mark.benchmark(group="refs-write")
def test_write_split_manifest_refs(benchmark, splitting, large_write_dataset) -> None:
def test_write_split_manifest_refs_full_rewrite(
benchmark, splitting, large_write_dataset
) -> None:
dataset = large_write_dataset
config = repo_config_with(splitting=splitting)
assert config is not None
Expand Down Expand Up @@ -219,3 +221,53 @@ def commit(session_from_setup):
session_from_setup.commit("wrote refs")

benchmark.pedantic(commit, setup=write_refs, iterations=1, rounds=10)


@pytest.mark.benchmark(group="refs-write")
def test_write_split_manifest_refs_append(
    benchmark, splitting, large_write_dataset
) -> None:
    # Benchmark committing *appends* of virtual refs: each round stages a fresh
    # batch of refs for the same array, so later commits must merge with the
    # manifests written by earlier commits (the manifest-splitting append path).
    dataset = large_write_dataset
    config = repo_config_with(splitting=splitting)
    assert config is not None
    if hasattr(config.manifest, "splitting"):
        assert config.manifest.splitting == splitting
    repo = dataset.create(config=config)
    session = repo.writable_session(branch="main")
    store = session.store
    group = zarr.open_group(store, zarr_format=3)
    group.create_array(
        "array",
        shape=dataset.shape,
        chunks=dataset.chunks,
        dtype="int8",
        fill_value=0,
        compressors=None,
    )
    session.commit("initialize")

    # yuck, but I'm abusing `rounds` to do a loop and time _only_ the commit.
    # `counter` is module-level state shared with `write_refs` below; it tracks
    # which batch of refs the next round should write.
    global counter
    counter = 0
    rounds = 10
    num_chunks = dataset.shape[0] // dataset.chunks[0]
    batch_size = num_chunks // rounds

    def write_refs() -> tuple[tuple[Session], dict]:
        # Setup for one benchmark round: stage `batch_size` virtual refs in a
        # fresh writable session, leaving only the commit to be timed.
        global counter
        session = repo.writable_session(branch="main")
        chunks = [
            VirtualChunkSpec(
                index=[i], location=f"s3://foo/bar/{i}.nc", offset=0, length=1
            )
            for i in range(counter * batch_size, counter * batch_size + batch_size)
        ]
        counter += 1
        session.store.set_virtual_refs("array", chunks)
        # benchmark.pedantic's `setup` callable must return (args, kwargs)
        # for the timed function.
        return ((session,), {})

    def commit(session_from_setup):
        # The timed operation: just the commit.
        session_from_setup.commit("wrote refs")

    benchmark.pedantic(commit, setup=write_refs, iterations=1, rounds=rounds)
1 change: 1 addition & 0 deletions icechunk-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ benchmark = [
"humanize",
"platformdirs",
"ipdb",
"coiled",
]
docs = [
"scipy",
Expand Down
6 changes: 3 additions & 3 deletions icechunk-python/src/session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,14 +165,14 @@ impl PySession {
pub fn merge(&self, other: &PySession, py: Python<'_>) -> PyResult<()> {
// This is blocking function, we need to release the Gil
py.allow_threads(move || {
// TODO: Bad clone
let changes = other.0.blocking_read().deref().changes().clone();
// TODO: bad clone
let other = other.0.blocking_read().deref().clone();

pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
self.0
.write()
.await
.merge(changes)
.merge(other)
.await
.map_err(PyIcechunkStoreError::SessionError)?;
Ok(())
Expand Down
2 changes: 1 addition & 1 deletion icechunk-python/tests/test_manifest_splitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def test_manifest_splitting_appends():
nchunks += math.prod(NEWSHAPE) * 2
# the lon size goes from 17 -> 19 so one extra manifest,
# compared to previous writes
nmanifests += 7 * 2
nmanifests += 2 * 2

assert len(os.listdir(f"{tmpdir}/chunks")) == nchunks
assert len(os.listdir(f"{tmpdir}/manifests")) == nmanifests
Expand Down
2 changes: 1 addition & 1 deletion icechunk-python/tests/test_stateful_repo_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ def check_list_prefix_from_root(self) -> None:
# need to load to dict to compare since ordering of entries might differ
expected = json.loads(self.model[k].to_bytes())
value = self.sync_store.get(k, default_buffer_prototype())
assert value is not None
assert value is not None, k
actual = json.loads(value.to_bytes())
actual_fv = actual.pop("fill_value")
expected_fv = expected.pop("fill_value")
Expand Down
206 changes: 182 additions & 24 deletions icechunk-python/tests/test_zarr/test_stateful.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
from collections.abc import Iterable
from typing import Any

import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import numpy as np
import pytest
Expand All @@ -13,18 +15,74 @@
run_state_machine_as_test,
)

import icechunk as ic
import zarr
from icechunk import Repository, in_memory_storage
from icechunk import Repository, Storage, in_memory_storage
from zarr.core.buffer import default_buffer_prototype
from zarr.testing.stateful import ZarrHierarchyStateMachine
from zarr.testing.strategies import (
basic_indices,
node_names,
np_array_and_chunks,
numpy_arrays,
orthogonal_indices,
)

PROTOTYPE = default_buffer_prototype()

# pytestmark = [
# pytest.mark.filterwarnings(
# "ignore::zarr.core.dtype.common.UnstableSpecificationWarning"
# ),
# ]


import functools


def with_frequency(frequency):
    """
    Decorator to control how frequently a rule runs in Hypothesis stateful tests.

    Args:
        frequency: Float between 0 and 1, where 1.0 means always run,
            0.1 means run ~10% of the time, etc.

    Usage:
        @rule()
        @with_frequency(0.1)  # Run ~10% of the time
        def rare_operation(self):
            pass
    """

    def decorator(func):
        # Per-function counter attribute stored on the state-machine instance.
        attr = f"__{func.__name__}_counter"

        @functools.wraps(func)
        def inner(self, *args, **kwargs):
            return func(self, *args, **kwargs)

        @precondition
        def should_run(self):
            # Count every invocation attempt on this instance.
            count = getattr(self, attr, 0) + 1
            setattr(self, attr, count)
            # Deterministic gate that fires roughly `frequency` of the time
            # over many calls.
            return (count * frequency) % 1.0 >= 1.0 - frequency

        # `should_run` is now a precondition decorator; gate the wrapped rule.
        return should_run(inner)

    return decorator


@st.composite
def chunk_paths(
Expand All @@ -39,14 +97,66 @@ def chunk_paths(
return "/".join(map(str, blockidx[subset_slicer]))


@st.composite
def splitting_configs(
    draw: st.DrawFn, *, arrays: Iterable[zarr.Array]
) -> ic.ManifestSplittingConfig:
    """Draw a random manifest-splitting config covering the given arrays.

    For each array we draw a matching condition (by terminal name or by full
    path) and, for a random non-empty subset of its dimensions, a split size
    keyed either by axis number or by dimension name.
    """
    config_dict = {}
    for array in arrays:
        # Match the array either by its terminal name or by its full path.
        if draw(st.booleans()):
            array_condition = ic.ManifestSplitCondition.name_matches(
                array.path.split("/")[-1]
            )
        else:
            array_condition = ic.ManifestSplitCondition.path_matches(array.path)
        dimnames = array.metadata.dimension_names or (None,) * array.ndim
        dimsize_axis_names = draw(
            st.lists(
                st.sampled_from(
                    tuple(zip(array.shape, range(array.ndim), dimnames, strict=False))
                ),
                min_size=1,
                unique=True,
            )
        )
        # Accumulate one entry per drawn dimension. (Previously the whole dict
        # was reassigned inside the loop, so only the last dimension survived.)
        dim_conditions = {}
        for size, axis, dimname in dimsize_axis_names:
            if dimname is None or draw(st.booleans()):
                key = ic.ManifestSplitDimCondition.Axis(axis)
            else:
                key = ic.ManifestSplitDimCondition.DimensionName(dimname)
            # Split sizes may exceed the dimension length (size + 10) to also
            # exercise the single-manifest case.
            dim_conditions[key] = draw(st.integers(min_value=1, max_value=size + 10))
        config_dict[array_condition] = dim_conditions
    return ic.ManifestSplittingConfig.from_dict(config_dict)


# TODO: more before/after commit invariants?
# TODO: add "/" to self.all_groups, deleting "/" seems to be problematic
class ModifiedZarrHierarchyStateMachine(ZarrHierarchyStateMachine):
def __init__(self, repo: Repository) -> None:
self.repo = repo
store = repo.writable_session("main").store
def __init__(self, storage: Storage) -> None:
self.storage = storage
self.repo = Repository.create(self.storage)
store = self.repo.writable_session("main").store
super().__init__(store)

@precondition(
    lambda self: not self.store.session.has_uncommitted_changes
    and bool(self.all_arrays)
)
@rule(data=st.data())
def reopen_with_config(self, data):
    """Reopen the repository with a freshly drawn manifest-splitting config.

    Only runs when the current session is clean, since reopening replaces
    `self.store` with a new writable session. Exercises splitting
    configuration changing between sessions of the same repository.
    """
    # Build splitting conditions for up to 3 of the existing arrays.
    array_paths = data.draw(
        st.lists(st.sampled_from(sorted(self.all_arrays)), max_size=3, unique=True)
    )
    arrays = tuple(zarr.open_array(self.model, path=path) for path in array_paths)
    sconfig = data.draw(splitting_configs(arrays=arrays))
    # inline_chunk_threshold_bytes=0 keeps chunks out of inline storage,
    # presumably so manifest splitting is actually exercised — TODO confirm.
    config = ic.RepositoryConfig(
        inline_chunk_threshold_bytes=0, manifest=ic.ManifestConfig(splitting=sconfig)
    )
    note(f"reopening with splitting config {sconfig=!r}")
    self.repo = Repository.open(self.storage, config=config)
    self.store = self.repo.writable_session("main").store

@precondition(lambda self: self.store.session.has_uncommitted_changes)
@rule(data=st.data())
def commit_with_check(self, data) -> None:
Expand Down Expand Up @@ -108,8 +218,49 @@ def add_array(
assume(array.size > 0)
super().add_array(data, name, array_and_chunks)

@precondition(lambda self: bool(self.all_groups))
@rule(data=st.data())
def check_list_dir(self, data: st.DataObject) -> None:
    """Compare list_dir output between the model store and Icechunk.

    A difference of exactly {"c"} is tolerated (see comment below); any
    other difference reaches the assert, which restates the comparison
    purely to produce an informative failure message.
    """
    path = self.draw_directory(data)
    note(f"list_dir for {path=!r}")
    model_ls = sorted(self._sync_iter(self.model.list_dir(path)))
    store_ls = sorted(self._sync_iter(self.store.list_dir(path)))
    if model_ls != store_ls and set(model_ls).symmetric_difference(set(store_ls)) != {
        "c"
    }:
        # Consider .list_dir("path/to/array") for an array with a single chunk.
        # The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists
        # If that chunk was deleted, then `"c"` is not returned.
        # LocalStore will not have this behaviour :/
        # In Icechunk, we always return the `c` so ignore this inconsistency.
        assert model_ls == store_ls, (model_ls, store_ls)

##### TODO: port everything below to zarr
@precondition(lambda self: bool(self.all_arrays))
@rule(data=st.data())
def check_array(self, data: st.DataObject) -> None:
    """Assert a randomly chosen array holds identical data in store and model."""
    chosen = data.draw(st.sampled_from(sorted(self.all_arrays)))
    from_store = zarr.open_array(self.store, path=chosen)[:]
    from_model = zarr.open_array(self.model, path=chosen)[:]
    np.testing.assert_equal(from_store, from_model)

@precondition(lambda self: bool(self.all_arrays))
@rule(data=st.data())
def overwrite_array_orthogonal_indexing(self, data: st.DataObject) -> None:
    """Overwrite a random orthogonal selection identically in model and store."""
    path = data.draw(st.sampled_from(sorted(self.all_arrays)))
    in_model = zarr.open_array(path=path, store=self.model)
    in_store = zarr.open_array(path=path, store=self.store)
    indexer, _ = data.draw(orthogonal_indices(shape=in_model.shape))
    note(f"overwriting array orthogonal {indexer=}")
    # The selection's shape dictates the shape of the replacement data.
    replacement = data.draw(
        npst.arrays(shape=in_model.oindex[indexer].shape, dtype=in_model.dtype)
    )
    in_model.oindex[indexer] = replacement
    in_store.oindex[indexer] = replacement

##### TODO: delete after next Zarr release (Jun 18, 2025)
@rule()
@with_frequency(0.25)
def clear(self) -> None:
note("clearing")
import zarr
Expand Down Expand Up @@ -152,23 +303,6 @@ def draw_directory(self, data) -> str:
path = array_or_group
return path

@precondition(lambda self: bool(self.all_groups))
@rule(data=st.data())
def check_list_dir(self, data) -> None:
path = self.draw_directory(data)
note(f"list_dir for {path=!r}")
model_ls = sorted(self._sync_iter(self.model.list_dir(path)))
store_ls = sorted(self._sync_iter(self.store.list_dir(path)))
if model_ls != store_ls and set(model_ls).symmetric_difference(set(store_ls)) != {
"c"
}:
# Consider .list_dir("path/to/array") for an array with a single chunk.
# The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists
# If that chunk was deleted, then `"c"` is not returned.
# LocalStore will not have this behaviour :/
# In Icechunk, we always return the `c` so ignore this inconsistency.
assert model_ls == store_ls, (model_ls, store_ls)

@precondition(lambda self: bool(self.all_arrays))
@rule(data=st.data())
def delete_chunk(self, data) -> None:
Expand All @@ -182,6 +316,32 @@ def delete_chunk(self, data) -> None:
self._sync(self.model.delete(path))
self._sync(self.store.delete(path))

@precondition(lambda self: bool(self.all_arrays))
@rule(data=st.data())
def overwrite_array_basic_indexing(self, data) -> None:
    """Overwrite a random basic (slice) selection identically in model and store."""
    path = data.draw(st.sampled_from(sorted(self.all_arrays)))
    in_model = zarr.open_array(path=path, store=self.model)
    in_store = zarr.open_array(path=path, store=self.store)
    slicer = data.draw(basic_indices(shape=in_model.shape))
    note(f"overwriting array basic {slicer=}")
    # The selection's shape dictates the shape of the replacement data.
    replacement = data.draw(
        npst.arrays(shape=in_model[slicer].shape, dtype=in_model.dtype)
    )
    in_model[slicer] = replacement
    in_store[slicer] = replacement

@precondition(lambda self: bool(self.all_arrays))
@rule(data=st.data())
def resize_array(self, data) -> None:
    """Resize a random array to a new shape of the same rank, in both stores."""
    path = data.draw(st.sampled_from(sorted(self.all_arrays)))
    model_array = zarr.open_array(path=path, store=self.model)
    store_array = zarr.open_array(path=path, store=self.store)
    rank = model_array.ndim
    # Same rank, every side at least 1 — zarr resize cannot change ndim.
    new_shape = data.draw(npst.array_shapes(max_dims=rank, min_dims=rank, min_side=1))
    note(f"resizing array from {model_array.shape} to {new_shape}")
    model_array.resize(new_shape)
    store_array.resize(new_shape)

@precondition(lambda self: bool(self.all_arrays) or bool(self.all_groups))
@rule(data=st.data())
def delete_dir(self, data) -> None:
Expand Down Expand Up @@ -219,10 +379,8 @@ def check_list_prefix_from_root(self) -> None:


def test_zarr_hierarchy() -> None:
repo = Repository.create(in_memory_storage())

def mk_test_instance_sync() -> ModifiedZarrHierarchyStateMachine:
return ModifiedZarrHierarchyStateMachine(repo)
return ModifiedZarrHierarchyStateMachine(in_memory_storage())

run_state_machine_as_test(
mk_test_instance_sync, settings=Settings(report_multiple_bugs=False)
Expand Down
Loading