11import json
2+ from collections .abc import Iterable
23from typing import Any
34
5+ import hypothesis .extra .numpy as npst
46import hypothesis .strategies as st
57import numpy as np
68import pytest
1315 run_state_machine_as_test ,
1416)
1517
18+ import icechunk as ic
1619import zarr
17- from icechunk import Repository , in_memory_storage
20+ from icechunk import Repository , Storage , in_memory_storage
1821from zarr .core .buffer import default_buffer_prototype
1922from zarr .testing .stateful import ZarrHierarchyStateMachine
2023from zarr .testing .strategies import (
24+ basic_indices ,
2125 node_names ,
2226 np_array_and_chunks ,
2327 numpy_arrays ,
28+ orthogonal_indices ,
2429)
2530
2631PROTOTYPE = default_buffer_prototype ()
2732
33+ # pytestmark = [
34+ # pytest.mark.filterwarnings(
35+ # "ignore::zarr.core.dtype.common.UnstableSpecificationWarning"
36+ # ),
37+ # ]
38+
39+
40+ import functools
41+
42+
def with_frequency(frequency):
    """
    Decorator to control how frequently a rule runs in Hypothesis stateful tests.

    The decorated rule is gated by a precondition that counts its own
    evaluations on the state-machine instance and returns True each time
    ``count * frequency`` crosses a whole number, so the rule is enabled on
    roughly ``frequency`` of the evaluations.

    Args:
        frequency: Float between 0 and 1, where 1.0 means always run,
                  0.1 means run ~10% of the time, etc.

    Returns:
        A decorator that attaches the frequency precondition to the rule.

    Usage:
        @rule()
        @with_frequency(0.1)  # Run ~10% of the time
        def rare_operation(self):
            pass
    """

    def decorator(func):
        # Counter attribute name specific to this function; the counter is
        # stored on the state-machine instance, so it starts at zero for
        # every new instance.
        counter_attr = f"__{func.__name__}_counter"

        def frequency_check(self):
            # NOTE(review): the counter advances on every precondition
            # evaluation, not on every rule execution -- confirm how often
            # Hypothesis evaluates preconditions per step.
            count = getattr(self, counter_attr, 0) + 1
            setattr(self, counter_attr, count)

            # Enable the rule whenever the whole part of count * frequency
            # advances. Comparing whole parts is robust to float rounding,
            # unlike testing the fractional part against 1 - frequency
            # (e.g. 9 * 0.3 == 2.6999999999999997 would be missed).
            return int(count * frequency) > int((count - 1) * frequency)

        # Attach the check as a Hypothesis precondition of the rule.
        return precondition(frequency_check)(func)

    return decorator
85+
2886
2987@st .composite
3088def chunk_paths (
@@ -39,14 +97,66 @@ def chunk_paths(
3997 return "/" .join (map (str , blockidx [subset_slicer ]))
4098
4199
@st.composite
def splitting_configs(
    draw: st.DrawFn, *, arrays: Iterable[zarr.Array]
) -> ic.ManifestSplittingConfig:
    """
    Hypothesis strategy that draws a manifest splitting config for ``arrays``.

    For each array, one array-level condition is drawn (match on the last
    path component, or on the full path), together with a non-empty, unique
    selection of its dimensions. Every selected dimension is keyed either by
    axis index or by dimension name (when a name exists) and mapped to an
    integer drawn from ``[1, size + 10]`` -- presumably the split size along
    that dimension; confirm against the icechunk documentation.

    Args:
        draw: Hypothesis draw function supplied by ``st.composite``.
        arrays: Arrays to generate splitting conditions for.

    Returns:
        The config built with ``ic.ManifestSplittingConfig.from_dict``.
    """
    config_dict = {}
    for array in arrays:
        if draw(st.booleans()):
            array_condition = ic.ManifestSplitCondition.name_matches(
                array.path.split("/")[-1]
            )
        else:
            array_condition = ic.ManifestSplitCondition.path_matches(array.path)

        # Arrays without dimension names get a None placeholder per axis.
        dimnames = array.metadata.dimension_names or (None,) * array.ndim
        dimsize_axis_names = draw(
            st.lists(
                st.sampled_from(
                    tuple(zip(array.shape, range(array.ndim), dimnames, strict=False))
                ),
                min_size=1,
                unique=True,
            )
        )

        # Accumulate one entry per selected dimension. Assigning a fresh
        # single-entry dict inside the loop would discard every dimension
        # except the last one drawn.
        dim_conditions = {}
        for size, axis, dimname in dimsize_axis_names:
            if dimname is None or draw(st.booleans()):
                key = ic.ManifestSplitDimCondition.Axis(axis)
            else:
                key = ic.ManifestSplitDimCondition.DimensionName(dimname)
            dim_conditions[key] = draw(st.integers(min_value=1, max_value=size + 10))
        config_dict[array_condition] = dim_conditions
    return ic.ManifestSplittingConfig.from_dict(config_dict)
131+
132+
42133# TODO: more before/after commit invariants?
43134# TODO: add "/" to self.all_groups, deleting "/" seems to be problematic
44135class ModifiedZarrHierarchyStateMachine (ZarrHierarchyStateMachine ):
45- def __init__ (self , repo : Repository ) -> None :
46- self .repo = repo
47- store = repo .writable_session ("main" ).store
def __init__(self, storage: Storage) -> None:
    """Create a new repository on ``storage`` and test its writable "main" session store."""
    # The storage handle is kept on the instance alongside the repository.
    self.storage = storage
    repo = Repository.create(storage)
    self.repo = repo
    super().__init__(repo.writable_session("main").store)
49141
@precondition(
    lambda machine: not machine.store.session.has_uncommitted_changes
    and bool(machine.all_arrays)
)
@rule(data=st.data())
def reopen_with_config(self, data):
    """Reopen the repository with a freshly drawn manifest splitting config.

    Only enabled when the session has no uncommitted changes and at least
    one array exists. Up to three distinct arrays are drawn, a splitting
    config is generated for them, and the repository is reopened from
    ``self.storage`` with that config; ``self.store`` is then replaced by a
    new writable session store on "main".
    """
    path_lists = st.lists(
        st.sampled_from(sorted(self.all_arrays)), max_size=3, unique=True
    )
    chosen_paths = data.draw(path_lists)
    # The arrays are opened from the model store.
    arrays = tuple([zarr.open_array(self.model, path=p) for p in chosen_paths])
    sconfig = data.draw(splitting_configs(arrays=arrays))
    manifest_config = ic.ManifestConfig(splitting=sconfig)
    config = ic.RepositoryConfig(
        inline_chunk_threshold_bytes=0, manifest=manifest_config
    )
    note(f"reopening with splitting config {sconfig=!r}")
    reopened = Repository.open(self.storage, config=config)
    self.repo = reopened
    self.store = reopened.writable_session("main").store
159+
50160 @precondition (lambda self : self .store .session .has_uncommitted_changes )
51161 @rule (data = st .data ())
52162 def commit_with_check (self , data ) -> None :
@@ -108,8 +218,49 @@ def add_array(
108218 assume (array .size > 0 )
109219 super ().add_array (data , name , array_and_chunks )
110220
@precondition(lambda machine: bool(machine.all_groups))
@rule(data=st.data())
def check_list_dir(self, data: st.DataObject) -> None:
    """Check that ``list_dir`` on the store agrees with the model.

    A difference consisting solely of the entry ``"c"`` is tolerated; any
    other mismatch fails the assertion.
    """
    path = self.draw_directory(data)
    note(f"list_dir for {path=!r}")
    model_ls = sorted(self._sync_iter(self.model.list_dir(path)))
    store_ls = sorted(self._sync_iter(self.store.list_dir(path)))
    if model_ls == store_ls:
        return
    # Consider .list_dir("path/to/array") for an array with a single chunk.
    # The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists
    # If that chunk was deleted, then `"c"` is not returned.
    # LocalStore will not have this behaviour :/
    # In Icechunk, we always return the `c` so ignore this inconsistency.
    if set(model_ls) ^ set(store_ls) == {"c"}:
        return
    assert model_ls == store_ls, (model_ls, store_ls)
237+
111238 ##### TODO: port everything below to zarr
@precondition(lambda machine: bool(machine.all_arrays))
@rule(data=st.data())
def check_array(self, data: st.DataObject) -> None:
    """Compare the full contents of one drawn array between store and model."""
    path = data.draw(st.sampled_from(sorted(self.all_arrays)))
    # Read from the store under test first, then from the model.
    store_array = zarr.open_array(self.store, path=path)
    actual = store_array[:]
    model_array = zarr.open_array(self.model, path=path)
    expected = model_array[:]
    np.testing.assert_equal(actual, expected)
246+
@precondition(lambda machine: bool(machine.all_arrays))
@rule(data=st.data())
def overwrite_array_orthogonal_indexing(self, data: st.DataObject) -> None:
    """Write the same drawn data into a drawn orthogonal selection of the model and store arrays."""
    array_path = data.draw(st.sampled_from(sorted(self.all_arrays)))
    model_array = zarr.open_array(path=array_path, store=self.model)
    store_array = zarr.open_array(path=array_path, store=self.store)
    indexer, _unused = data.draw(orthogonal_indices(shape=model_array.shape))
    note(f"overwriting array orthogonal {indexer=}")
    # The replacement data takes the shape of the current selection.
    selection_shape = model_array.oindex[indexer].shape
    new_data = data.draw(
        npst.arrays(shape=selection_shape, dtype=model_array.dtype)
    )
    for target in (model_array, store_array):
        target.oindex[indexer] = new_data
260+
261+ ##### TODO: delete after next Zarr release (Jun 18, 2025)
112262 @rule ()
263+ @with_frequency (0.25 )
113264 def clear (self ) -> None :
114265 note ("clearing" )
115266 import zarr
@@ -152,23 +303,6 @@ def draw_directory(self, data) -> str:
152303 path = array_or_group
153304 return path
154305
155- @precondition (lambda self : bool (self .all_groups ))
156- @rule (data = st .data ())
157- def check_list_dir (self , data ) -> None :
158- path = self .draw_directory (data )
159- note (f"list_dir for { path = !r} " )
160- model_ls = sorted (self ._sync_iter (self .model .list_dir (path )))
161- store_ls = sorted (self ._sync_iter (self .store .list_dir (path )))
162- if model_ls != store_ls and set (model_ls ).symmetric_difference (set (store_ls )) != {
163- "c"
164- }:
165- # Consider .list_dir("path/to/array") for an array with a single chunk.
166- # The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists
167- # If that chunk was deleted, then `"c"` is not returned.
168- # LocalStore will not have this behaviour :/
169- # In Icechunk, we always return the `c` so ignore this inconsistency.
170- assert model_ls == store_ls , (model_ls , store_ls )
171-
172306 @precondition (lambda self : bool (self .all_arrays ))
173307 @rule (data = st .data ())
174308 def delete_chunk (self , data ) -> None :
@@ -182,6 +316,32 @@ def delete_chunk(self, data) -> None:
182316 self ._sync (self .model .delete (path ))
183317 self ._sync (self .store .delete (path ))
184318
@precondition(lambda machine: bool(machine.all_arrays))
@rule(data=st.data())
def overwrite_array_basic_indexing(self, data) -> None:
    """Write the same drawn data into a drawn basic-index selection of the model and store arrays."""
    array_path = data.draw(st.sampled_from(sorted(self.all_arrays)))
    model_array = zarr.open_array(path=array_path, store=self.model)
    store_array = zarr.open_array(path=array_path, store=self.store)
    slicer = data.draw(basic_indices(shape=model_array.shape))
    note(f"overwriting array basic {slicer=}")
    # The replacement data takes the shape of the current selection.
    selection_shape = model_array[slicer].shape
    new_data = data.draw(
        npst.arrays(shape=selection_shape, dtype=model_array.dtype)
    )
    for target in (model_array, store_array):
        target[slicer] = new_data
332+
@precondition(lambda machine: bool(machine.all_arrays))
@rule(data=st.data())
def resize_array(self, data) -> None:
    """Resize one drawn array to a drawn shape of the same rank, in both model and store."""
    array_path = data.draw(st.sampled_from(sorted(self.all_arrays)))
    model_arr = zarr.open_array(path=array_path, store=self.model)
    store_arr = zarr.open_array(path=array_path, store=self.store)
    ndim = model_arr.ndim
    # Rank is fixed to the array's own; every side is at least 1.
    shape_strategy = npst.array_shapes(max_dims=ndim, min_dims=ndim, min_side=1)
    new_shape = data.draw(shape_strategy)
    note(f"resizing array from {model_arr.shape} to {new_shape}")
    for target in (model_arr, store_arr):
        target.resize(new_shape)
344+
185345 @precondition (lambda self : bool (self .all_arrays ) or bool (self .all_groups ))
186346 @rule (data = st .data ())
187347 def delete_dir (self , data ) -> None :
@@ -219,10 +379,8 @@ def check_list_prefix_from_root(self) -> None:
219379
220380
221381def test_zarr_hierarchy () -> None :
222- repo = Repository .create (in_memory_storage ())
223-
224382 def mk_test_instance_sync () -> ModifiedZarrHierarchyStateMachine :
225- return ModifiedZarrHierarchyStateMachine (repo )
383+ return ModifiedZarrHierarchyStateMachine (in_memory_storage () )
226384
227385 run_state_machine_as_test (
228386 mk_test_instance_sync , settings = Settings (report_multiple_bugs = False )
0 commit comments