From 4b30b9f25aa1349ace708227c28b5e3dc6ec56f9 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 12 May 2025 21:01:03 -0600
Subject: [PATCH 01/43] [WIP] Optimize split manifest writes.

---
 icechunk/src/change_set.rs             | 236 ++++++++--
 icechunk/src/conflicts/detector.rs     |   1 -
 icechunk/src/format/manifest.rs        |  48 +-
 icechunk/src/format/transaction_log.rs |   1 -
 icechunk/src/repository.rs             |  26 +-
 icechunk/src/session.rs                | 622 ++++++++++++++++++-------
 icechunk/src/strategies.rs             |  14 +-
 7 files changed, 708 insertions(+), 240 deletions(-)
diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 8a5f880be..d42830b98 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -1,7 +1,6 @@
 use std::{
     collections::{BTreeMap, HashMap, HashSet},
     iter,
-    mem::take,
 };
 
 use bytes::Bytes;
@@ -9,9 +8,10 @@ use itertools::{Either, Itertools as _};
 use serde::{Deserialize, Serialize};
 
 use crate::{
+    config::ManifestSplittingConfig,
     format::{
         ChunkIndices, NodeId, Path,
-        manifest::{ChunkInfo, ChunkPayload},
+        manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits},
         snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot},
     },
     session::SessionResult,
@@ -24,20 +24,89 @@ pub struct ArrayData {
     pub user_data: Bytes,
 }
 
+impl ManifestSplits {
+    /// Returns the extents of the split that `which` selects for `coord`.
+    pub fn which_extent(&self, coord: &ChunkIndices) -> SessionResult<&ManifestExtents> {
+        Ok(self.0.get(self.which(coord)?).unwrap_or_else(|| {
+            panic!("logic bug, could not find ManifestExtents for this coordinate: {coord:?}")
+        }))
+    }
+}
+
+#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
+pub struct SplitManifest {
+    from: Vec<u32>,
+    to: Vec<u32>,
+    // It's important we keep these sorted, we use this fact in TransactionLog creation
+    chunks: BTreeMap<ChunkIndices, Option<ChunkPayload>>,
+}
+
+impl SplitManifest {
+    /// Records `data` for `coord` and grows `from`/`to` so they still bound every chunk.
+    ///
+    /// The bounds are (re)initialised from `coord` whenever no chunk is recorded yet.
+    pub fn update(&mut self, coord: ChunkIndices, data: Option<ChunkPayload>) {
+        if self.chunks.is_empty() {
+            // First chunk of this split. Keyed on `chunks` rather than `from` so that
+            // zero-dimensional coordinates (which leave `from` empty) can be set twice.
+            self.from.clear();
+            self.to.clear();
+            self.from.extend_from_slice(&coord.0);
+            // important to remember that `to` is not inclusive, so we need +1
+            self.to.extend(coord.0.iter().map(|n| *n + 1));
+        } else {
+            for (lower, index) in self.from.iter_mut().zip(coord.0.iter()) {
+                *lower = (*lower).min(*index);
+            }
+            for (upper, index) in self.to.iter_mut().zip(coord.0.iter()) {
+                // important to remember that `to` is not inclusive, so we need +1
+                *upper = (*upper).max(*index + 1);
+            }
+        }
+        self.chunks.insert(coord, data);
+    }
+
+    /// Drops every chunk for which `predicate` returns `true` and keeps the rest,
+    /// then rebuilds `from`/`to` so they tightly bound the surviving chunks.
+    pub fn retain(&mut self, predicate: impl Fn(&ChunkIndices) -> bool) {
+        let previous = std::mem::take(&mut self.chunks);
+        self.from.clear();
+        self.to.clear();
+        for (coord, data) in previous.into_iter().filter(|(c, _)| !predicate(c)) {
+            self.update(coord, data);
+        }
+    }
+
+    /// Builds the `ManifestExtents` described by the tracked `from`/`to` bounds.
+    pub fn extents(&self) -> ManifestExtents {
+        ManifestExtents::new(self.from.as_slice(), self.to.as_slice())
+    }
+}
+
 #[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
 pub struct ChangeSet {
+    // splitting configuration is recorded at the time the writable session is created
+    // we ignore any succeeding changes in repository config.
+    splitting: ManifestSplittingConfig,
+    // This is an optimization so that we needn't figure out the split sizes on every set.
+    // TODO: consider merging with `set_chunks` BTreeMap
+    splits: HashMap<NodeId, ManifestSplits>,
     new_groups: HashMap<Path, (NodeId, Bytes)>,
     new_arrays: HashMap<Path, (NodeId, ArrayData)>,
     updated_arrays: HashMap<NodeId, ArrayData>,
     updated_groups: HashMap<NodeId, Bytes>,
-    // It's important we keep these sorted, we use this fact in TransactionLog creation
-    // TODO: could track ManifestExtents
-    set_chunks: BTreeMap<NodeId, BTreeMap<ChunkIndices, Option<ChunkPayload>>>,
+    // FIXME: It's important we keep these sorted, we use this fact in TransactionLog creation
+    //        Change HashMap -> BTreeMap, need to check Ord on ManifestExtents
+    set_chunks: BTreeMap<NodeId, HashMap<ManifestExtents, SplitManifest>>,
     deleted_groups: HashSet<(Path, NodeId)>,
     deleted_arrays: HashSet<(Path, NodeId)>,
 }
 
 impl ChangeSet {
+    pub fn new(splitting: ManifestSplittingConfig) -> Self {
+        Self { splitting, ..Default::default() }
+    }
+
     pub fn deleted_arrays(&self) -> impl Iterator<Item = &(Path, NodeId)> {
         self.deleted_arrays.iter()
     }
@@ -58,11 +127,16 @@ impl ChangeSet {
         self.deleted_arrays.contains(path_and_id)
     }
 
+    pub fn splits(&self, id: &NodeId) -> Option<&ManifestSplits> {
+        self.splits.get(id)
+    }
+
     pub fn chunk_changes(
         &self,
-    ) -> impl Iterator<Item = (&NodeId, &BTreeMap<ChunkIndices, Option<ChunkPayload>>)>
-    {
-        self.set_chunks.iter()
+    ) -> impl Iterator<Item = (&NodeId, impl Iterator<Item = &ChunkIndices>)> {
+        self.set_chunks.iter().map(|(node_id, split_map)| {
+            (node_id, split_map.values().flat_map(|x| x.chunks.keys()))
+        })
     }
 
     pub fn has_chunk_changes(&self, node: &NodeId) -> bool {
@@ -70,7 +144,7 @@ impl ChangeSet {
     }
 
     pub fn arrays_with_chunk_changes(&self) -> impl Iterator<Item = &NodeId> {
-        self.chunk_changes().map(|(node, _)| node)
+        self.set_chunks.iter().map(|(node, _)| node)
     }
 
     pub fn is_empty(&self) -> bool {
@@ -100,11 +174,39 @@ impl ChangeSet {
         }
     }
 
+    /// Computes and caches the manifest splits for `node_id` unless already cached.
+    fn maybe_update_cached_splits(
+        &mut self,
+        node_id: &NodeId,
+        path: &Path,
+        shape: &ArrayShape,
+        dimension_names: &Option<Vec<DimensionName>>,
+    ) {
+        // Splits are set once per node in a session and never changed afterwards, so a
+        // later change (e.g. to a dimension name) does not alter the cached splits.
+        let config = &self.splitting;
+        self.splits
+            .entry(node_id.clone())
+            .or_insert_with(|| config.get_split_sizes(path, shape, dimension_names));
+    }
+
     pub fn add_array(&mut self, path: Path, node_id: NodeId, array_data: ArrayData) {
+        self.maybe_update_cached_splits(
+            &node_id,
+            &path,
+            &array_data.shape,
+            &array_data.dimension_names,
+        );
         self.new_arrays.insert(path, (node_id, array_data));
     }
 
     pub fn update_array(&mut self, node_id: &NodeId, path: &Path, array_data: ArrayData) {
+        self.maybe_update_cached_splits(
+            &node_id,
+            &path,
+            &array_data.shape,
+            &array_data.dimension_names,
+        );
         match self.new_arrays.get(path) {
             Some((id, _)) => {
                 debug_assert!(!self.updated_arrays.contains_key(id));
@@ -139,6 +241,7 @@ impl ChangeSet {
 
         self.updated_arrays.remove(node_id);
         self.set_chunks.remove(node_id);
+        self.splits.remove(node_id);
         if !is_new_array {
             self.deleted_arrays.insert((path, node_id.clone()));
         }
@@ -167,14 +270,30 @@ impl ChangeSet {
         coord: ChunkIndices,
         data: Option<ChunkPayload>,
     ) {
+        let cached_splits = self.splits.get(&node_id).expect(&format!(
+            "logic bug. change_set.splits should be populated for node {}",
+            node_id
+        ));
+
+        let extent = cached_splits.which_extent(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
         // this implementation makes delete idempotent
         // it allows deleting a deleted chunk by repeatedly setting None.
         self.set_chunks
             .entry(node_id)
             .and_modify(|h| {
-                h.insert(coord.clone(), data.clone());
+                h.entry(extent.clone()).or_default().update(coord.clone(), data.clone());
             })
-            .or_insert(BTreeMap::from([(coord, data)]));
+            .or_insert_with(|| {
+                let mut h = HashMap::<ManifestExtents, SplitManifest>::with_capacity(
+                    cached_splits.len(),
+                );
+                h.entry(extent.clone())
+                    // TODO: this is duplicative. I can't use `or_default` because it's
+                    // nice to create the HashMap using `with_capacity`
+                    .or_default()
+                    .update(coord, data);
+                h
+            });
     }
 
     pub fn get_chunk_ref(
@@ -182,7 +301,17 @@ impl ChangeSet {
         node_id: &NodeId,
         coords: &ChunkIndices,
     ) -> Option<&Option<ChunkPayload>> {
-        self.set_chunks.get(node_id).and_then(|h| h.get(coords))
+        self.splits
+            .get(node_id)
+            .and_then(|splits| {
+                splits.which_extent(coords).ok().map(|extent| {
+                    self.set_chunks
+                        .get(node_id)
+                        .and_then(|h| h.get(&extent))
+                        .and_then(|s| s.chunks.get(coords))
+                })
+            })
+            .flatten()
     }
 
     /// Drop the updated chunk references for the node.
@@ -190,10 +319,12 @@ impl ChangeSet {
     pub fn drop_chunk_changes(
         &mut self,
         node_id: &NodeId,
-        predicate: impl Fn(&ChunkIndices) -> bool,
+        predicate: impl Fn(&ChunkIndices) -> bool + Copy,
     ) {
         if let Some(changes) = self.set_chunks.get_mut(node_id) {
-            changes.retain(|coord, _| !predicate(coord));
+            for split in changes.values_mut() {
+                split.retain(predicate);
+            }
         }
     }
 
@@ -201,13 +332,21 @@ impl ChangeSet {
         &self,
         node_id: &NodeId,
         node_path: &Path,
+        extent: Option<ManifestExtents>,
     ) -> impl Iterator<Item = (&ChunkIndices, &Option<ChunkPayload>)> + use<'_> {
         if self.is_deleted(node_path, node_id) {
             return Either::Left(iter::empty());
         }
         match self.set_chunks.get(node_id) {
             None => Either::Left(iter::empty()),
-            Some(h) => Either::Right(h.iter()),
+            Some(h) => Either::Right(
+                h.iter()
+                    // FIXME: review this
+                    .filter(move |(manifest_extent, _)| {
+                        extent.is_none() || Some(*manifest_extent) == extent.as_ref()
+                    })
+                    .flat_map(|(_, manifest)| manifest.chunks.iter()),
+            ),
         }
     }
 
@@ -224,7 +363,7 @@ impl ChangeSet {
         node_id: &'a NodeId,
         node_path: &Path,
     ) -> impl Iterator<Item = ChunkInfo> + use<'a> {
-        self.array_chunks_iterator(node_id, node_path).filter_map(
+        self.array_chunks_iterator(node_id, node_path, None).filter_map(
             move |(coords, payload)| {
                 payload.as_ref().map(|p| ChunkInfo {
                     node: node_id.clone(),
@@ -235,6 +374,47 @@ impl ChangeSet {
         )
     }
 
+    /// Iterates the split manifests recorded for `node_id`, keyed by their extents.
+    pub fn array_manifests_iterator(
+        &self,
+        node_id: &NodeId,
+        node_path: &Path,
+    ) -> impl Iterator<Item = (&ManifestExtents, &SplitManifest)> + use<'_> {
+        // A deleted node yields nothing, and so does a node without chunk changes.
+        self.set_chunks
+            .get(node_id)
+            .filter(|_| !self.is_deleted(node_path, node_id))
+            .into_iter()
+            .flatten()
+    }
+
+    pub fn array_manifest(
+        &self,
+        node_id: &NodeId,
+        extent: &ManifestExtents,
+    ) -> Option<&SplitManifest> {
+        self.set_chunks.get(node_id).and_then(|x| x.get(extent))
+    }
+
+    /// Iterator over chunks for a new array for a given ManifestExtents.
+    ///
+    /// Looks up the split manifest recorded for `node_id` under `extent` and yields a
+    /// `ChunkInfo` for each entry that carries a payload. Entries whose payload is
+    /// `None` are skipped; without a matching split manifest the iterator is empty.
+    pub fn new_array_manifest_chunks_iterator<'a>(
+        &'a self,
+        node_id: &'a NodeId,
+        extent: &ManifestExtents,
+    ) -> impl Iterator<Item = ChunkInfo> + use<'a> {
+        // The optional entry iterator flattens to the manifest's entries, or to
+        // nothing at all when no manifest is stored for `extent`.
+        let entries = self.array_manifest(node_id, extent).map(|m| m.chunks.iter());
+        entries.into_iter().flatten().filter_map(move |(coord, payload)| {
+            let payload = payload.as_ref()?.clone();
+            Some(ChunkInfo { node: node_id.clone(), coord: coord.clone(), payload })
+        })
+    }
+
     pub fn new_nodes(&self) -> impl Iterator<Item = (&Path, &NodeId)> {
         self.new_groups().chain(self.new_arrays())
     }
@@ -247,24 +427,25 @@ impl ChangeSet {
         self.new_arrays.iter().map(|(path, (node_id, _))| (path, node_id))
     }
 
-    pub fn take_chunks(
-        &mut self,
-    ) -> BTreeMap<NodeId, BTreeMap<ChunkIndices, Option<ChunkPayload>>> {
-        take(&mut self.set_chunks)
-    }
+    // pub fn take_chunks(
+    //     &mut self,
+    // ) -> BTreeMap<NodeId, BTreeMap<ChunkIndices, Option<ChunkPayload>>> {
+    //     take(&mut self.set_chunks)
+    // }
 
-    pub fn set_chunks(
-        &mut self,
-        chunks: BTreeMap<NodeId, BTreeMap<ChunkIndices, Option<ChunkPayload>>>,
-    ) {
-        self.set_chunks = chunks
-    }
+    // pub fn set_chunks(
+    //     &mut self,
+    //     chunks: BTreeMap<NodeId, BTreeMap<ChunkIndices, Option<ChunkPayload>>>,
+    // ) {
+    //     self.set_chunks = chunks
+    // }
 
     /// Merge this ChangeSet with `other`.
     ///
     /// Results of the merge are applied to `self`. Changes present in `other` take precedence over
     /// `self` changes.
     pub fn merge(&mut self, other: ChangeSet) {
+        // FIXME: what do I do with splitting and splits here.
         // FIXME: this should detect conflict, for example, if different writers added on the same
         // path, different objects, or if the same path is added and deleted, etc.
         // TODO: optimize
@@ -274,6 +455,7 @@ impl ChangeSet {
         self.updated_arrays.extend(other.updated_arrays);
         self.deleted_groups.extend(other.deleted_groups);
         self.deleted_arrays.extend(other.deleted_arrays);
+        // FIXME: handle splits
 
         for (node, other_chunks) in other.set_chunks.into_iter() {
             match self.set_chunks.remove(&node) {
diff --git a/icechunk/src/conflicts/detector.rs b/icechunk/src/conflicts/detector.rs
index 3de442e62..d2c4cef24 100644
--- a/icechunk/src/conflicts/detector.rs
+++ b/icechunk/src/conflicts/detector.rs
@@ -145,7 +145,6 @@ impl ConflictSolver for ConflictDetector {
                     None
                 } else {
                     let conflicting: HashSet<_> = changes
-                        .keys()
                         .filter(|coord| previous_changes.contains(coord))
                         .cloned()
                         .collect();
diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index ea03d32d3..f9e89009d 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -1,10 +1,16 @@
-use std::{borrow::Cow, convert::Infallible, ops::Range, sync::Arc};
+use std::{
+    borrow::Cow,
+    cmp::{max, min},
+    convert::Infallible,
+    ops::Range,
+    sync::Arc,
+};
 
 use crate::format::flatbuffers::generated;
 use bytes::Bytes;
 use flatbuffers::VerifierOptions;
 use futures::{Stream, TryStreamExt};
-use itertools::{Itertools, multiunzip};
+use itertools::{Itertools, any, multiunzip};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 
@@ -18,15 +24,9 @@ use super::{
     ChunkId, ChunkIndices, ChunkLength, ChunkOffset, IcechunkResult, ManifestId, NodeId,
 };
 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize)]
 pub struct ManifestExtents(Vec<Range<u32>>);
 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct ManifestRef {
-    pub object_id: ManifestId,
-    pub extents: ManifestExtents,
-}
-
 impl ManifestExtents {
     pub fn new(from: &[u32], to: &[u32]) -> Self {
         let v = from
@@ -37,6 +37,10 @@ impl ManifestExtents {
         Self(v)
     }
 
+    pub fn from_ranges_iter(ranges: impl IntoIterator<Item = Range<u32>>) -> Self {
+        Self(ranges.into_iter().collect())
+    }
+
     pub fn contains(&self, coord: &[u32]) -> bool {
         self.iter().zip(coord.iter()).all(|(range, that)| range.contains(that))
     }
@@ -52,10 +56,32 @@ impl ManifestExtents {
     pub fn is_empty(&self) -> bool {
         self.0.is_empty()
     }
+
+    /// Per-axis intersection, or `None` if it is empty on any axis (ranges are half-open).
+    pub fn intersection(&self, other: &Self) -> Option<Self> {
+        debug_assert_eq!(self.len(), other.len());
+        let ranges = std::iter::zip(self.iter(), other.iter())
+            .map(|(a, b)| max(a.start, b.start)..min(a.end, b.end))
+            .collect::<Vec<_>>();
+        if any(ranges.iter(), |r| r.end <= r.start) { None } else { Some(Self(ranges)) }
+    }
+
+    /// Smallest extents that cover both `self` and `other` on every axis.
+    pub fn union(&self, other: &Self) -> Self {
+        debug_assert_eq!(self.len(), other.len());
+        let pairs = std::iter::zip(self.iter(), other.iter());
+        Self(pairs.map(|(a, b)| min(a.start, b.start)..max(a.end, b.end)).collect())
+    }
 }
 
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
-pub struct ManifestSplits(Vec<ManifestExtents>);
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct ManifestRef {
+    pub object_id: ManifestId,
+    pub extents: ManifestExtents,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct ManifestSplits(pub Vec<ManifestExtents>);
 
 impl ManifestSplits {
     /// Used at read-time
diff --git a/icechunk/src/format/transaction_log.rs b/icechunk/src/format/transaction_log.rs
index ee064b66f..06324b833 100644
--- a/icechunk/src/format/transaction_log.rs
+++ b/icechunk/src/format/transaction_log.rs
@@ -47,7 +47,6 @@ impl TransactionLog {
                 let node_id = generated::ObjectId8::new(&node_id.0);
                 let node_id = Some(&node_id);
                 let chunks = chunks
-                    .keys()
                     .map(|indices| {
                         let coords = Some(builder.create_vector(indices.0.as_slice()));
                         generated::ChunkIndices::create(
diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index d7652a815..54668c50c 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -1380,24 +1380,15 @@ mod tests {
             ),
         ];
         let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) };
-        let repo = create_repo_with_split_manifest_config(
-            &temp_path,
-            &shape,
-            &dimension_names,
-            &split_config,
-            None,
-        )
-        .await?;
 
-        let session = repo.writable_session("main").await?;
-        let actual =
-            split_config.get_split_sizes(&session.get_node(&temp_path).await?)?;
         let expected = ManifestSplits::from_edges(vec![
             vec![0, 12, 24, 25],
             vec![0, 9, 10],
             vec![0, 2, 3],
             vec![0, 4],
         ]);
+
+        let actual = split_config.get_split_sizes(&temp_path, &shape, &dimension_names);
         assert_eq!(actual, expected);
 
         let split_sizes = vec![(
@@ -1418,18 +1409,7 @@ mod tests {
             ],
         )];
         let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) };
-        let repo = create_repo_with_split_manifest_config(
-            &temp_path,
-            &shape,
-            &dimension_names,
-            &split_config,
-            None,
-        )
-        .await?;
-
-        let session = repo.writable_session("main").await?;
-        let actual =
-            split_config.get_split_sizes(&session.get_node(&temp_path).await?)?;
+        let actual = split_config.get_split_sizes(&temp_path, &shape, &dimension_names);
         assert_eq!(actual, expected);
 
         Ok(())
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 8a4c257b2..8175242a4 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -3,6 +3,7 @@ use std::{
     collections::{HashMap, HashSet},
     convert::Infallible,
     future::{Future, ready},
+    iter::zip,
     ops::Range,
     pin::Pin,
     sync::Arc,
@@ -188,6 +189,40 @@ impl ManifestSplits {
     }
 }
 
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Overlap {
+    Complete,
+    Partial,
+    None,
+}
+
+/// Classifies how much of `them` is covered by `us`, one dimension at a time.
+///
+/// Important: this is not symmetric. `Complete`: every range of `us` contains the
+/// matching range of `them`. Otherwise `None` if any pair is disjoint, else `Partial`.
+pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap {
+    debug_assert!(us.len() == them.len());
+
+    // Two flags are enough for the verdict, so no per-dimension results are collected.
+    let (mut all_complete, mut any_disjoint) = (true, false);
+    for (a, b) in zip(us.iter(), them.iter()) {
+        debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone());
+        debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone());
+
+        if (a.start <= b.start) && (a.end >= b.end) {
+            continue;
+        }
+        all_complete = false;
+        any_disjoint |= (a.end <= b.start) || (a.start >= b.end);
+    }
+
+    match (all_complete, any_disjoint) {
+        (true, _) => Overlap::Complete,
+        (false, true) => Overlap::None,
+        (false, false) => Overlap::Partial,
+    }
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Session {
     config: RepositoryConfig,
@@ -234,6 +269,7 @@ impl Session {
         snapshot_id: SnapshotId,
         default_commit_metadata: SnapshotProperties,
     ) -> Self {
+        let splitting = config.manifest().splitting().clone();
         Self {
             config,
             storage_settings: Arc::new(storage_settings),
@@ -242,7 +278,7 @@ impl Session {
             virtual_resolver,
             branch_name: Some(branch_name),
             snapshot_id,
-            change_set: ChangeSet::default(),
+            change_set: ChangeSet::new(splitting),
             default_commit_metadata,
         }
     }
@@ -560,6 +596,7 @@ impl Session {
             &self.change_set,
             &self.snapshot_id,
             path,
+            None,
         )
         .await
     }
@@ -800,6 +837,7 @@ impl Session {
             &self.change_set,
             &self.snapshot_id,
             node.clone(),
+            None,
         )
         .await
         .map_ok(|(_path, chunk_info)| chunk_info.coord);
@@ -1112,8 +1150,14 @@ async fn updated_chunk_iterator<'a>(
     let snapshot = asset_manager.fetch_snapshot(snapshot_id).await?;
     let nodes = futures::stream::iter(snapshot.iter_arc());
     let res = nodes.and_then(move |node| async move {
-        Ok(updated_node_chunks_iterator(asset_manager, change_set, snapshot_id, node)
-            .await)
+        Ok(updated_node_chunks_iterator(
+            asset_manager,
+            change_set,
+            snapshot_id,
+            node,
+            None,
+        )
+        .await)
     });
     Ok(res.try_flatten())
 }
@@ -1123,6 +1167,7 @@ async fn updated_node_chunks_iterator<'a>(
     change_set: &'a ChangeSet,
     snapshot_id: &'a SnapshotId,
     node: NodeSnapshot,
+    extent: Option<ManifestExtents>,
 ) -> impl Stream<Item = SessionResult<(Path, ChunkInfo)>> + 'a {
     // This iterator should yield chunks for existing arrays + any updates.
     // we check for deletion here in the case that `path` exists in the snapshot,
@@ -1133,9 +1178,15 @@ async fn updated_node_chunks_iterator<'a>(
         let path = node.path.clone();
         Either::Right(
             // TODO: avoid clone
-            verified_node_chunk_iterator(asset_manager, snapshot_id, change_set, node)
-                .await
-                .map_ok(move |ci| (path.clone(), ci)),
+            verified_node_chunk_iterator(
+                asset_manager,
+                snapshot_id,
+                change_set,
+                node,
+                extent,
+            )
+            .await
+            .map_ok(move |ci| (path.clone(), ci)),
         )
     }
 }
@@ -1146,11 +1197,18 @@ async fn node_chunk_iterator<'a>(
     change_set: &'a ChangeSet,
     snapshot_id: &'a SnapshotId,
     path: &Path,
+    extent: Option<ManifestExtents>,
 ) -> impl Stream<Item = SessionResult<ChunkInfo>> + 'a + use<'a> {
     match get_node(asset_manager, change_set, snapshot_id, path).await {
         Ok(node) => futures::future::Either::Left(
-            verified_node_chunk_iterator(asset_manager, snapshot_id, change_set, node)
-                .await,
+            verified_node_chunk_iterator(
+                asset_manager,
+                snapshot_id,
+                change_set,
+                node,
+                extent,
+            )
+            .await,
         ),
         Err(_) => futures::future::Either::Right(futures::stream::empty()),
     }
@@ -1162,20 +1220,22 @@ async fn verified_node_chunk_iterator<'a>(
     snapshot_id: &'a SnapshotId,
     change_set: &'a ChangeSet,
     node: NodeSnapshot,
+    extent: Option<ManifestExtents>,
 ) -> impl Stream<Item = SessionResult<ChunkInfo>> + 'a {
     match node.node_data {
         NodeData::Group => futures::future::Either::Left(futures::stream::empty()),
         NodeData::Array { manifests, .. } => {
             let new_chunk_indices: Box<HashSet<&ChunkIndices>> = Box::new(
                 change_set
-                    .array_chunks_iterator(&node.id, &node.path)
+                    .array_chunks_iterator(&node.id, &node.path, extent.clone())
                     .map(|(idx, _)| idx)
                     .collect(),
             );
 
             let node_id_c = node.id.clone();
+            let extent_c = extent.clone();
             let new_chunks = change_set
-                .array_chunks_iterator(&node.id, &node.path)
+                .array_chunks_iterator(&node.id, &node.path, extent.clone())
                 .filter_map(move |(idx, payload)| {
                     payload.as_ref().map(|payload| {
                         Ok(ChunkInfo {
@@ -1189,11 +1249,17 @@ async fn verified_node_chunk_iterator<'a>(
             futures::future::Either::Right(
                 futures::stream::iter(new_chunks).chain(
                     futures::stream::iter(manifests)
+                        .filter(move |manifest_ref| {
+                            futures::future::ready(extent.as_ref().is_none_or(|e| {
+                                overlaps(&manifest_ref.extents, &e) != Overlap::None
+                            }))
+                        })
                         .then(move |manifest_ref| {
                             let new_chunk_indices = new_chunk_indices.clone();
                             let node_id_c = node.id.clone();
                             let node_id_c2 = node.id.clone();
                             let node_id_c3 = node.id.clone();
+                            let extent_c2 = extent_c.clone();
                             async move {
                                 let manifest = fetch_manifest(
                                     &manifest_ref.object_id,
@@ -1207,6 +1273,13 @@ async fn verified_node_chunk_iterator<'a>(
                                             .iter(node_id_c.clone())
                                             .filter_ok(move |(coord, _)| {
                                                 !new_chunk_indices.contains(coord)
+                                                    // If the manifest we are parsing partially overlaps with `extent`,
+                                                    // we need to filter all coordinates
+                                                    && extent_c2.as_ref().is_none_or(
+                                                        move |e| {
+                                                            e.contains(coord.0.as_slice())
+                                                        },
+                                                    )
                                             })
                                             .map_ok(move |(coord, payload)| ChunkInfo {
                                                 node: node_id_c2.clone(),
@@ -1214,6 +1287,7 @@ async fn verified_node_chunk_iterator<'a>(
                                                 payload,
                                             });
 
+                                        // FIXME: I don't understand this
                                         let old_chunks = change_set
                                             .update_existing_chunks(
                                                 node_id_c3, old_chunks,
@@ -1444,41 +1518,6 @@ pub fn construct_valid_byte_range(
     }
 }
 
-#[derive(Default, Debug)]
-struct SplitManifest {
-    from: Vec<u32>,
-    to: Vec<u32>,
-    chunks: Vec<ChunkInfo>,
-}
-
-impl SplitManifest {
-    fn update(&mut self, chunk: ChunkInfo) {
-        if self.from.is_empty() {
-            debug_assert!(self.to.is_empty());
-            debug_assert!(self.chunks.is_empty());
-            // important to remember that `to` is not inclusive, so we need +1
-            let mut coord = chunk.coord.0.clone();
-            self.to.extend(coord.iter().map(|n| *n + 1));
-            self.from.append(&mut coord);
-        } else {
-            for (existing, coord) in self.from.iter_mut().zip(chunk.coord.0.iter()) {
-                if coord < existing {
-                    *existing = *coord
-                }
-            }
-            for (existing, coord) in self.to.iter_mut().zip(chunk.coord.0.iter()) {
-                // important to remember that `to` is not inclusive, so we need +1
-                let range_value = coord + 1;
-                if range_value > *existing {
-                    *existing = range_value
-                }
-            }
-        }
-
-        self.chunks.push(chunk)
-    }
-}
-
 struct FlushProcess<'a> {
     asset_manager: Arc<AssetManager>,
     change_set: &'a ChangeSet,
@@ -1505,69 +1544,91 @@ impl<'a> FlushProcess<'a> {
         }
     }
 
-    async fn write_manifests_from_iterator(
+    async fn write_manifest_for_updated_chunks(
         &mut self,
-        node_id: &NodeId,
-        chunks: impl Stream<Item = SessionResult<ChunkInfo>>,
-        splits: ManifestSplits,
-    ) -> SessionResult<()> {
-        // TODO: think about optimizing writes to manifests
-        // TODO: add benchmarks
-        let split_refs = chunks
-            .try_fold(
-                // TODO: have the changeset track this HashMap
-                HashMap::<usize, SplitManifest>::with_capacity(splits.len()),
-                |mut split_refs, chunk| async {
-                    let split_index = splits.which(&chunk.coord);
-                    split_index.map(|index| {
-                        split_refs.entry(index).or_default().update(chunk);
-                        split_refs
-                    })
-                },
-            )
-            .await?;
-
-        for (_, shard) in split_refs.into_iter() {
-            let shard_chunks =
-                stream::iter(shard.chunks.into_iter().map(Ok::<ChunkInfo, Infallible>));
-
-            if let Some(new_manifest) = Manifest::from_stream(shard_chunks).await.unwrap()
-            {
-                let new_manifest = Arc::new(new_manifest);
-                let new_manifest_size =
-                    self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?;
-
-                let file_info =
-                    ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size);
-                self.manifest_files.insert(file_info);
-
-                let new_ref = ManifestRef {
-                    object_id: new_manifest.id().clone(),
-                    extents: ManifestExtents::new(&shard.from, &shard.to),
-                };
+        node: &NodeSnapshot,
+        extent: &ManifestExtents,
+        actual_extents: ManifestExtents,
+    ) -> SessionResult<Option<ManifestRef>> {
+        let asset_manager = Arc::clone(&self.asset_manager);
+        let updated_chunks = updated_node_chunks_iterator(
+            asset_manager.as_ref(),
+            self.change_set,
+            self.parent_id,
+            node.clone(),
+            Some(extent.clone()),
+        )
+        .await
+        .map_ok(|(_path, chunk_info)| chunk_info);
+        self.write_manifest_from_iterator(updated_chunks, actual_extents).await
+    }
 
-                self.manifest_refs
-                    .entry(node_id.clone())
-                    .and_modify(|v| v.push(new_ref.clone()))
-                    .or_insert_with(|| vec![new_ref]);
-            }
+    async fn write_manifest_from_iterator(
+        &mut self,
+        chunks: impl Stream<Item = SessionResult<ChunkInfo>>,
+        actual_extents: ManifestExtents,
+    ) -> SessionResult<Option<ManifestRef>> {
+        if let Some(new_manifest) = Manifest::from_stream(chunks).await.unwrap() {
+            let new_manifest = Arc::new(new_manifest);
+            let new_manifest_size =
+                self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?;
+
+            let file_info =
+                ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size);
+            self.manifest_files.insert(file_info);
+
+            let new_ref = ManifestRef {
+                object_id: new_manifest.id().clone(),
+                extents: actual_extents,
+            };
+            Ok(Some(new_ref))
+        } else {
+            Ok(None)
         }
+    }
 
+    fn finalize_refs(
+        &mut self,
+        node_id: &NodeId,
+        refs: HashMap<ManifestExtents, ManifestRef>,
+    ) -> SessionResult<()> {
+        for ref_ in refs.into_values() {
+            self.manifest_refs
+                .entry(node_id.clone())
+                .and_modify(|v| v.push(ref_.clone()))
+                .or_insert_with(|| vec![ref_]);
+        }
         Ok(())
     }
-
     /// Write a manifest for a node that was created in this session
     /// It doesn't need to look at previous manifests because the node is new
     async fn write_manifest_for_new_node(
         &mut self,
         node_id: &NodeId,
         node_path: &Path,
-        splits: ManifestSplits,
     ) -> SessionResult<()> {
-        let chunks = stream::iter(
-            self.change_set.new_array_chunk_iterator(node_id, node_path).map(Ok),
-        );
-        self.write_manifests_from_iterator(node_id, chunks, splits).await
+        let splits = self.change_set.splits(node_id).expect(&format!("logic bug, array at {} was added in this changeset, splits should be populated.", node_path));
+        let mut refs =
+            HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
+
+        // TODO: this could be try_fold with the refs HashMap as state
+        for extent in splits.iter() {
+            let cs_extents = self
+                .change_set
+                .array_manifest(node_id, extent)
+                .expect("logic bug. there should be a manifest for this extent ")
+                .extents();
+
+            let chunks = stream::iter(
+                self.change_set
+                    .new_array_manifest_chunks_iterator(node_id, extent)
+                    .map(Ok),
+            );
+            self.write_manifest_from_iterator(chunks, cs_extents)
+                .await?
+                .map(|new_ref| refs.insert(extent.clone(), new_ref));
+        }
+        self.finalize_refs(node_id, refs)
     }
 
     /// Write a manifest for a node that was modified in this session
@@ -1577,18 +1638,82 @@ impl<'a> FlushProcess<'a> {
         &mut self,
         node: &NodeSnapshot,
         splits: ManifestSplits,
+        manifests: Vec<ManifestRef>,
     ) -> SessionResult<()> {
-        let asset_manager = Arc::clone(&self.asset_manager);
-        let updated_chunks = updated_node_chunks_iterator(
-            asset_manager.as_ref(),
-            self.change_set,
-            self.parent_id,
-            node.clone(),
-        )
-        .await
-        .map_ok(|(_path, chunk_info)| chunk_info);
+        // populate with existing refs, if they are compatible
+        let mut refs =
+            HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
+
+        let on_disk_extents =
+            manifests.iter().map(|m| m.extents.clone()).collect::<Vec<_>>();
+
+        let modified_splits = self
+            .change_set
+            .array_manifests_iterator(&node.id, &node.path)
+            .map(|(extents, _)| extents)
+            .collect::<HashSet<_>>();
+
+        // FIXME: there is an invariant here
+        // ``modified_splits`` (i.e. splits used in this session)
+        // must be a subset of ``splits`` (the splits set in the config)
+
+        // TODO: this should be try_fold with the refs HashMap as state
+        for extent in splits.iter() {
+            let on_disk_bbox = on_disk_extents
+                .iter()
+                .filter_map(|e| e.intersection(extent))
+                .reduce(|a, b| a.union(&b));
+
+            if modified_splits.contains(extent) {
+                // this split was modified in this session, rewrite it completely
+                let cs_extents = self
+                    .change_set
+                    .array_manifest(&node.id, extent)
+                    .expect("logic bug. there should be a manifest for this extent ")
+                    .extents();
+                let actual_extents =
+                    // if there are splits on disk that overlap, then we take that Extents
+                    // and union it with the Extents of the chunks in the changeset
+                    on_disk_bbox.map(|x| cs_extents.union(&x))
+                    // if no overlap, then just use the changeset Extents
+                    .unwrap_or(cs_extents);
+                self.write_manifest_for_updated_chunks(&node, extent, actual_extents)
+                    .await?
+                    .map(|new_ref| refs.insert(extent.clone(), new_ref));
+            } else {
+                // split was unmodified in this session. Let's look at the current manifests
+                // and see what we need to do with them
+                for old_ref in manifests.iter() {
+                    // Remember that the extents written to disk are the `from`:`to` ranges
+                    // of populated chunks
+                    match overlaps(&old_ref.extents, extent) {
+                        Overlap::Complete => {
+                            debug_assert!(on_disk_bbox.is_some());
+                            // Just propagate this ref again, no rewriting necessary
+                            refs.insert(extent.clone(), old_ref.clone());
+                        }
+                        Overlap::Partial => {
+                            // the splits have changed, but no refs in this split have been written in this session
+                            // same as `if` block above
+                            debug_assert!(on_disk_bbox.is_some());
+                            self.write_manifest_for_updated_chunks(
+                                &node,
+                                extent,
+                                on_disk_bbox.clone().expect("logic bug in writing manifests from disk for partially overlapping split"),
+                            )
+                            .await?
+                            .map(|new_ref| refs.insert(extent.clone(), new_ref));
+                        }
+                        Overlap::None => {
+                            // Nothing to do
+                        }
+                    };
+                }
+            }
+        }
 
-        self.write_manifests_from_iterator(&node.id, updated_chunks, splits).await
+        self.finalize_refs(&node.id, refs)?;
+        Ok(())
     }
 
     /// Record the previous manifests for an array that was not modified in the session
@@ -1636,69 +1761,65 @@ impl ManifestSplitDimCondition {
 }
 
 impl ManifestSplittingConfig {
-    pub fn get_split_sizes(&self, node: &NodeSnapshot) -> SessionResult<ManifestSplits> {
-        match &node.node_data {
-            NodeData::Group => Err(SessionErrorKind::NotAnArray {
-                node: node.clone(),
-                message: "attempting to split manifest for group".to_string(),
-            }
-            .into()),
-            NodeData::Array { shape, dimension_names, .. } => {
-                let ndim = shape.len();
-                let num_chunks = shape.num_chunks().collect::<Vec<_>>();
-                let mut edges: Vec<Vec<u32>> =
-                    (0..ndim).map(|axis| vec![0, num_chunks[axis]]).collect();
-
-                // This is ugly but necessary to handle:
-                //   - path: *
-                //     manifest-split-size:
-                //     - t : 10
-                //   - path: *
-                //     manifest-split-size:
-                //     - y : 2
-                // which is now identical to:
-                //   - path: *
-                //     manifest-split-size:
-                //     - t : 10
-                //     - y : 2
-                let mut already_matched: HashSet<usize> = HashSet::new();
-
-                #[allow(clippy::expect_used)]
-                let split_sizes = self
-                    .split_sizes
+    pub fn get_split_sizes(
+        &self,
+        path: &Path,
+        shape: &ArrayShape,
+        dimension_names: &Option<Vec<DimensionName>>,
+    ) -> ManifestSplits {
+        let ndim = shape.len();
+        let num_chunks = shape.num_chunks().collect::<Vec<_>>();
+        let mut edges: Vec<Vec<u32>> =
+            (0..ndim).map(|axis| vec![0, num_chunks[axis]]).collect();
+
+        // This is ugly but necessary to handle:
+        //   - path: *
+        //     manifest-split-size:
+        //     - t : 10
+        //   - path: *
+        //     manifest-split-size:
+        //     - y : 2
+        // which is now identical to:
+        //   - path: *
+        //     manifest-split-size:
+        //     - t : 10
+        //     - y : 2
+        let mut already_matched: HashSet<usize> = HashSet::new();
+
+        #[allow(clippy::expect_used)]
+        let split_sizes = self
+            .split_sizes
+            .clone()
+            .or_else(|| Self::default().split_sizes)
+            .expect("logic bug in grabbing split sizes from ManifestSplittingConfig");
+
+        for (condition, dim_specs) in split_sizes.iter() {
+            if condition.matches(path) {
+                let dimension_names = dimension_names
                     .clone()
-                    .or_else(|| Self::default().split_sizes)
-                    .expect("logic bug");
-
-                for (condition, dim_specs) in split_sizes.iter() {
-                    if condition.matches(&node.path) {
-                        let dimension_names = dimension_names.clone().unwrap_or(
-                            repeat_n(DimensionName::NotSpecified, ndim).collect(),
-                        );
-                        for (axis, dimname) in itertools::enumerate(dimension_names) {
-                            if already_matched.contains(&axis) {
-                                continue;
-                            }
-                            for ManifestSplitDim {
-                                condition: dim_condition,
-                                num_chunks: split_size,
-                            } in dim_specs.iter()
-                            {
-                                if dim_condition.matches(axis, dimname.clone().into()) {
-                                    edges[axis] = uniform_manifest_split_edges(
-                                        num_chunks[axis],
-                                        split_size,
-                                    );
-                                    already_matched.insert(axis);
-                                    break;
-                                };
-                            }
-                        }
+                    .unwrap_or(repeat_n(DimensionName::NotSpecified, ndim).collect());
+                for (axis, dimname) in itertools::enumerate(dimension_names) {
+                    if already_matched.contains(&axis) {
+                        continue;
+                    }
+                    for ManifestSplitDim {
+                        condition: dim_condition,
+                        num_chunks: split_size,
+                    } in dim_specs.iter()
+                    {
+                        if dim_condition.matches(axis, dimname.clone().into()) {
+                            edges[axis] = uniform_manifest_split_edges(
+                                num_chunks[axis],
+                                split_size,
+                            );
+                            already_matched.insert(axis);
+                            break;
+                        };
                     }
                 }
-                Ok(ManifestSplits::from_edges(edges))
             }
         }
+        ManifestSplits::from_edges(edges)
     }
 }
 
@@ -1739,8 +1860,18 @@ async fn flush(
                 &node.path,
             )
             .await?;
-            let splits = splitting_config.get_split_sizes(&new_node)?;
-            flush_data.write_manifest_for_existing_node(&node, splits).await?;
+            if let NodeData::Array { shape, dimension_names, manifests } =
+                new_node.node_data
+            {
+                let splits = splitting_config.get_split_sizes(
+                    &new_node.path,
+                    &shape,
+                    &dimension_names,
+                );
+                flush_data
+                    .write_manifest_for_existing_node(&node, splits, manifests)
+                    .await?;
+            }
         } else {
             trace!(path=%node.path, "Node has no changes, keeping the previous manifest");
             // Array wasn't deleted but has no changes in this session
@@ -1754,15 +1885,7 @@ async fn flush(
 
     for (node_path, node_id) in flush_data.change_set.new_arrays() {
         trace!(path=%node_path, "New node, writing a manifest");
-        let node = get_node(
-            &flush_data.asset_manager,
-            flush_data.change_set,
-            flush_data.parent_id,
-            node_path,
-        )
-        .await?;
-        let splits = splitting_config.get_split_sizes(&node)?;
-        flush_data.write_manifest_for_new_node(node_id, node_path, splits).await?;
+        flush_data.write_manifest_for_new_node(node_id, node_path).await?;
     }
 
     trace!("Building new snapshot");
@@ -1942,13 +2065,17 @@ mod tests {
         refs::{Ref, fetch_tag},
         repository::VersionInfo,
         storage::new_in_memory_storage,
-        strategies::{ShapeDim, empty_writable_session, node_paths, shapes_and_dims},
+        strategies::{
+            ShapeDim, empty_writable_session, manifest_extents, node_paths,
+            shapes_and_dims,
+        },
     };
 
     use super::*;
     use icechunk_macros::tokio_test;
-    use itertools::Itertools;
+    use itertools::{Itertools, all, multizip};
     use pretty_assertions::assert_eq;
+    use proptest::collection::vec;
     use proptest::prelude::{prop_assert, prop_assert_eq};
     use storage::logging::LoggingStorage;
     use test_strategy::proptest;
@@ -2108,6 +2235,149 @@ mod tests {
         prop_assert!(session.delete_group(path.clone()).await.is_ok());
     }
 
+    #[proptest]
+    fn test_property_extents_set_ops_same(
+        #[strategy(manifest_extents(4))] e: ManifestExtents,
+    ) {
+        prop_assert_eq!(e.intersection(&e), Some(e.clone()));
+        prop_assert_eq!(e.union(&e), e.clone());
+        prop_assert_eq!(overlaps(&e, &e), Overlap::Complete);
+    }
+
+    #[proptest]
+    fn test_property_extents_set_ops(
+        #[strategy(manifest_extents(4))] e1: ManifestExtents,
+        #[strategy(manifest_extents(4))] e2: ManifestExtents,
+    ) {
+        let union = e1.union(&e2);
+        let intersection = e1.intersection(&e2);
+
+        prop_assert_eq!(e1.intersection(&union), Some(e1.clone()));
+        prop_assert_eq!(union.intersection(&e1), Some(e1.clone()));
+        prop_assert_eq!(e2.intersection(&union), Some(e2.clone()));
+        prop_assert_eq!(union.intersection(&e2), Some(e2.clone()));
+
+        // order is important for the next 2
+        prop_assert_eq!(overlaps(&union, &e1), Overlap::Complete);
+        prop_assert_eq!(overlaps(&union, &e2), Overlap::Complete);
+
+        if let Some(int) = intersection {
+            // each operand is expected to completely overlap its intersection with
+            // the other operand; as above, the argument order is important
+            prop_assert_eq!(overlaps(&e1, &int), Overlap::Complete);
+            prop_assert_eq!(overlaps(&e2, &int), Overlap::Complete);
+        } else {
+            prop_assert_eq!(overlaps(&e1, &e2), Overlap::None);
+            prop_assert_eq!(overlaps(&e2, &e1), Overlap::None);
+        }
+    }
+
+    #[proptest]
+    fn test_property_extents_widths(
+        #[strategy(manifest_extents(4))] extent1: ManifestExtents,
+        #[strategy(vec(0..100, 4))] delta_left: Vec<i32>,
+        #[strategy(vec(0..100, 4))] delta_right: Vec<i32>,
+    ) {
+        let widths = extent1.iter().map(|r| (r.end - r.start) as i32).collect::<Vec<_>>();
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map(
+                |(extent, dleft, dright)| {
+                    ((extent.start as i32 + dleft) as u32)
+                        ..((extent.end as i32 + dright) as u32)
+                },
+            ),
+        );
+
+        if all(delta_left.iter(), |elem| elem == &0i32)
+            && all(delta_right.iter(), |elem| elem == &0i32)
+        {
+            prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Complete);
+        }
+
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((
+                extent1.iter(),
+                widths.iter(),
+                delta_left.iter(),
+                delta_right.iter(),
+            ))
+            .map(|(extent, width, dleft, dright)| {
+                let (low, high) = (dleft.min(dright), dleft.max(dright));
+                ((extent.start as i32 + width + low) as u32)
+                    ..((extent.end as i32 + width + high) as u32)
+            }),
+        );
+
+        prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None);
+
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((
+                extent1.iter(),
+                widths.iter(),
+                delta_left.iter(),
+                delta_right.iter(),
+            ))
+            .map(|(extent, width, dleft, dright)| {
+                let (low, high) = (dleft.min(dright), dleft.max(dright));
+                ((extent.start as i32 - width - high).max(0i32) as u32)
+                    ..((extent.end as i32 - width - low) as u32)
+            }),
+        );
+        prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None);
+
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map(
+                |(extent, dleft, dright)| {
+                    ((extent.start as i32 - dleft - 1).max(0i32) as u32)
+                        ..((extent.end as i32 + dright + 1) as u32)
+                },
+            ),
+        );
+        prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Partial);
+    }
+
+    #[icechunk_macros::test]
+    fn test_overlaps() -> Result<(), Box<dyn Error>> {
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![2u32, 4, 6].as_slice(),
+        );
+
+        let e2 = ManifestExtents::new(
+            vec![10u32, 1, 2].as_slice(),
+            vec![12u32, 4, 6].as_slice(),
+        );
+
+        let union = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![12u32, 4, 6].as_slice(),
+        );
+
+        assert_eq!(overlaps(&e1, &e2), Overlap::None);
+        assert_eq!(e1.intersection(&e2), None);
+        assert_eq!(e1.union(&e2), union);
+
+
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![2u32, 4, 6].as_slice(),
+        );
+        let e2 = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![42u32, 4, 6].as_slice(),
+        );
+        assert_eq!(overlaps(&e1, &e2), Overlap::None);
+        assert_eq!(overlaps(&e2, &e1), Overlap::None);
+
+        // this should create non-overlapping extents
+        let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20], vec![0, 1, 2], vec![0, 21, 22]]);
+        for vec in splits.iter().combinations(2) {
+            assert_eq!(overlaps(vec[0], vec[1]), Overlap::None)
+        }
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_which_split() -> Result<(), Box<dyn Error>> {
         let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20]]);
diff --git a/icechunk/src/strategies.rs b/icechunk/src/strategies.rs
index 2b3a20310..350a7f57f 100644
--- a/icechunk/src/strategies.rs
+++ b/icechunk/src/strategies.rs
@@ -7,11 +7,14 @@ use proptest::prelude::*;
 use proptest::{collection::vec, option, strategy::Strategy};
 
 use crate::Repository;
+use crate::format::manifest::ManifestExtents;
 use crate::format::snapshot::{ArrayShape, DimensionName};
 use crate::format::{ChunkIndices, Path};
 use crate::session::Session;
 use crate::storage::new_in_memory_storage;
 
+const MAX_NDIM: usize = 4;
+
 pub fn node_paths() -> impl Strategy<Value = Path> {
     // FIXME: Add valid paths
     #[allow(clippy::expect_used)]
@@ -66,7 +69,7 @@ pub struct ShapeDim {
 
 pub fn shapes_and_dims(max_ndim: Option<usize>) -> impl Strategy<Value = ShapeDim> {
     // FIXME: ndim = 0
-    let max_ndim = max_ndim.unwrap_or(4usize);
+    let max_ndim = max_ndim.unwrap_or(MAX_NDIM);
     vec(1u64..26u64, 1..max_ndim)
         .prop_flat_map(|shape| {
             let ndim = shape.len();
@@ -97,6 +100,15 @@ pub fn shapes_and_dims(max_ndim: Option<usize>) -> impl Strategy<Value = ShapeDi
         })
 }
 
+pub fn manifest_extents(ndim: usize) -> impl Strategy<Value = ManifestExtents> {
+    (vec(0u32..1000u32, ndim), vec(0u32..1000u32, ndim)).prop_map(|(start, delta)| {
+        let stop = std::iter::zip(start.iter(), delta.iter())
+            .map(|(s, d)| s + d)
+            .collect::<Vec<_>>();
+        ManifestExtents::new(start.as_slice(), stop.as_slice())
+    })
+}
+
 //prop_compose! {
 //    pub fn zarr_array_metadata()(
 //        chunk_key_encoding: ChunkKeyEncoding,

From 3a19559fbfa91c8ec9c8c4b75a7ab8aa040b8b01 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 6 Jun 2025 13:11:14 -0600
Subject: [PATCH 02/43] Refactor to track splitting on Session

---
 icechunk/src/change_set.rs |  99 ++++++----------------
 icechunk/src/session.rs    | 168 +++++++++++++++++++++++++------------
 2 files changed, 141 insertions(+), 126 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index d42830b98..e48aec6e0 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -8,13 +8,12 @@ use itertools::{Either, Itertools as _};
 use serde::{Deserialize, Serialize};
 
 use crate::{
-    config::ManifestSplittingConfig,
     format::{
         ChunkIndices, NodeId, Path,
         manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits},
         snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot},
     },
-    session::SessionResult,
+    session::{SessionResult, which_extent_and_index},
 };
 
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
@@ -24,15 +23,6 @@ pub struct ArrayData {
     pub user_data: Bytes,
 }
 
-impl ManifestSplits {
-    pub fn which_extent(&self, coord: &ChunkIndices) -> SessionResult<&ManifestExtents> {
-        Ok(self.0.get(self.which(coord)?).expect(&format!(
-            "logic bug, could not find ManifestExtents for this coordinate: {:?}",
-            coord
-        )))
-    }
-}
-
 #[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
 pub struct SplitManifest {
     from: Vec<u32>,
@@ -85,12 +75,6 @@ impl SplitManifest {
 
 #[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
 pub struct ChangeSet {
-    // splitting configuration is recorded at the time the writable session is created
-    // we ignore any succeeding changes in repository config.
-    splitting: ManifestSplittingConfig,
-    // This is an optimization so that we needn't figure out the split sizes on every set.
-    // TODO: consider merging with `set_chunks` BTreeMap
-    splits: HashMap<NodeId, ManifestSplits>,
     new_groups: HashMap<Path, (NodeId, Bytes)>,
     new_arrays: HashMap<Path, (NodeId, ArrayData)>,
     updated_arrays: HashMap<NodeId, ArrayData>,
@@ -103,10 +87,6 @@ pub struct ChangeSet {
 }
 
 impl ChangeSet {
-    pub fn new(splitting: ManifestSplittingConfig) -> Self {
-        Self { splitting, ..Default::default() }
-    }
-
     pub fn deleted_arrays(&self) -> impl Iterator<Item = &(Path, NodeId)> {
         self.deleted_arrays.iter()
     }
@@ -127,10 +107,6 @@ impl ChangeSet {
         self.deleted_arrays.contains(path_and_id)
     }
 
-    pub fn splits(&self, id: &NodeId) -> Option<&ManifestSplits> {
-        self.splits.get(id)
-    }
-
     pub fn chunk_changes(
         &self,
     ) -> impl Iterator<Item = (&NodeId, impl Iterator<Item = &ChunkIndices>)> {
@@ -174,39 +150,11 @@ impl ChangeSet {
         }
     }
 
-    fn maybe_update_cached_splits(
-        &mut self,
-        node_id: &NodeId,
-        path: &Path,
-        shape: &ArrayShape,
-        dimension_names: &Option<Vec<DimensionName>>,
-    ) {
-        if !self.splits.contains_key(node_id) {
-            // Q: What happens if we set a chunk, then change a dimension name, so
-            //   that the split changes.
-            // A: We ignore it. splits are set once for a node in a session, and are never changed.
-            let splits = self.splitting.get_split_sizes(path, shape, dimension_names);
-            self.splits.insert(node_id.clone(), splits);
-        }
-    }
-
     pub fn add_array(&mut self, path: Path, node_id: NodeId, array_data: ArrayData) {
-        self.maybe_update_cached_splits(
-            &node_id,
-            &path,
-            &array_data.shape,
-            &array_data.dimension_names,
-        );
         self.new_arrays.insert(path, (node_id, array_data));
     }
 
     pub fn update_array(&mut self, node_id: &NodeId, path: &Path, array_data: ArrayData) {
-        self.maybe_update_cached_splits(
-            &node_id,
-            &path,
-            &array_data.shape,
-            &array_data.dimension_names,
-        );
         match self.new_arrays.get(path) {
             Some((id, _)) => {
                 debug_assert!(!self.updated_arrays.contains_key(id));
@@ -241,7 +189,6 @@ impl ChangeSet {
 
         self.updated_arrays.remove(node_id);
         self.set_chunks.remove(node_id);
-        self.splits.remove(node_id);
         if !is_new_array {
             self.deleted_arrays.insert((path, node_id.clone()));
         }
@@ -269,13 +216,9 @@ impl ChangeSet {
         node_id: NodeId,
         coord: ChunkIndices,
         data: Option<ChunkPayload>,
+        splits: &ManifestSplits,
     ) {
-        let cached_splits = self.splits.get(&node_id).expect(&format!(
-            "logic bug. change_set.splits should be populated for node {}",
-            node_id
-        ));
-
-        let extent = cached_splits.which_extent(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
+        let (_, extent) = splits.which_extent_and_index(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
         // this implementation makes delete idempotent
         // it allows deleting a deleted chunk by repeatedly setting None.
         self.set_chunks
@@ -285,7 +228,7 @@ impl ChangeSet {
             })
             .or_insert_with(|| {
                 let mut h = HashMap::<ManifestExtents, SplitManifest>::with_capacity(
-                    cached_splits.len(),
+                    splits.len(),
                 );
                 h.entry(extent.clone())
                     // TODO: this is duplicative. I can't use `or_default` because it's
@@ -301,17 +244,16 @@ impl ChangeSet {
         node_id: &NodeId,
         coords: &ChunkIndices,
     ) -> Option<&Option<ChunkPayload>> {
-        self.splits
-            .get(node_id)
-            .and_then(|splits| {
-                splits.which_extent(coords).ok().map(|extent| {
-                    self.set_chunks
-                        .get(node_id)
-                        .and_then(|h| h.get(&extent))
-                        .and_then(|s| s.chunks.get(coords))
+        if let Some(node_chunks) = self.set_chunks.get(node_id) {
+            which_extent_and_index(node_chunks.keys(), coords)
+                .ok()
+                .map(|(_, extent)| {
+                    node_chunks.get(&extent).and_then(|s| s.chunks.get(coords))
                 })
-            })
-            .flatten()
+                .flatten()
+        } else {
+            None
+        }
     }
 
     /// Drop the updated chunk references for the node.
@@ -599,7 +541,7 @@ mod tests {
         change_set::ArrayData,
         format::{
             ChunkIndices, NodeId,
-            manifest::{ChunkInfo, ChunkPayload},
+            manifest::{ChunkInfo, ChunkPayload, ManifestSplits},
             snapshot::ArrayShape,
         },
     };
@@ -631,28 +573,39 @@ mod tests {
         );
         assert_eq!(None, change_set.new_arrays_chunk_iterator().next());
 
-        change_set.set_chunk_ref(node_id1.clone(), ChunkIndices(vec![0, 1]), None);
+        let splits = ManifestSplits::from_edges(vec![vec![0, 10], vec![0, 10]]);
+
+        change_set.set_chunk_ref(
+            node_id1.clone(),
+            ChunkIndices(vec![0, 1]),
+            None,
+            &splits,
+        );
         assert_eq!(None, change_set.new_arrays_chunk_iterator().next());
 
         change_set.set_chunk_ref(
             node_id1.clone(),
             ChunkIndices(vec![1, 0]),
             Some(ChunkPayload::Inline("bar1".into())),
+            &splits,
         );
         change_set.set_chunk_ref(
             node_id1.clone(),
             ChunkIndices(vec![1, 1]),
             Some(ChunkPayload::Inline("bar2".into())),
+            &splits,
         );
         change_set.set_chunk_ref(
             node_id2.clone(),
             ChunkIndices(vec![0]),
             Some(ChunkPayload::Inline("baz1".into())),
+            &splits,
         );
         change_set.set_chunk_ref(
             node_id2.clone(),
             ChunkIndices(vec![1]),
             Some(ChunkPayload::Inline("baz2".into())),
+            &splits,
         );
 
         {
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 8175242a4..e1650a74b 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -161,31 +161,45 @@ impl From<VirtualReferenceError> for SessionError {
 
 pub type SessionResult<T> = Result<T, SessionError>;
 
+/// Returns the position within `iter`, and a clone, of the first extent that contains `coord`.
+/// This can be used at write time to split manifests based on the config
+/// and at read time to choose which manifest to query for chunk payload
+pub fn which_extent_and_index<'a>(
+    iter: impl Iterator<Item = &'a ManifestExtents>,
+    coord: &ChunkIndices,
+) -> SessionResult<(usize, ManifestExtents)> {
+    // The returned extent bounds `coord`; for `n` extents in `iter` the
+    // returned index satisfies 0 <= index < n.
+    // It is possible that no extent includes a coord. Say we have a 2x2 split grid
+    // but only split (0,0) and split (1,1) are populated with data.
+    // A coord located in (1, 0) should return Err
+    // Since the extents need not form a regular grid, we must iterate through and find the first result.
+    // The ManifestExtents in `iter` MUST NOT overlap with each other. How do we ensure this?
+    // ndim must be the same. TODO: debug_assert that `coord` and every extent agree on ndim
+    // FIXME: could optimize for unbounded single manifest
+    // Note: I don't think we can distinguish between out of bounds index for the array
+    //       and an index that is part of a split that hasn't been written yet.
+    iter.enumerate()
+        .find(|(_, e)| e.contains(coord.0.as_slice()))
+        .map(|(i, e)| (i, e.clone()))
+        // lazy: `ok_or` would clone `coord` and build the error even when a match was found
+        .ok_or_else(|| {
+            SessionErrorKind::InvalidIndexForSplitManifests { coords: coord.clone() }
+                .into()
+        })
+}
+
 impl ManifestSplits {
-    // Returns the index of split_range that includes ChunkIndices
-    // This can be used at write time to split manifests based on the config
-    // and at read time to choose which manifest to query for chunk payload
-    pub fn which(&self, coord: &ChunkIndices) -> SessionResult<usize> {
-        // split_range[i] must bound ChunkIndices
-        // 0 <= return value <= split_range.len()
-        // it is possible that split_range does not include a coord. say we have 2x2 split grid
-        // but only split (0,0) and split (1,1) are populated with data.
-        // A coord located in (1, 0) should return Err
-        // Since split_range need not form a regular grid, we must iterate through and find the first result.
-        // ManifestExtents in split_range MUST NOT overlap with each other. How do we ensure this?
-        // ndim must be the same
-        // debug_assert_eq!(coord.0.len(), split_range[0].len());
-        // FIXME: could optimize for unbounded single manifest
-        // Note: I don't think we can distinguish between out of bounds index for the array
-        //       and an index that is part of a split that hasn't been written yet.
-        self.iter()
-            .enumerate()
-            .find(|(_, e)| e.contains(coord.0.as_slice()))
-            .map(|(i, _)| i)
-            .ok_or(
-                SessionErrorKind::InvalidIndexForSplitManifests { coords: coord.clone() }
-                    .into(),
-            )
+    pub fn which_extent_and_index(
+        &self,
+        coord: &ChunkIndices,
+    ) -> SessionResult<(usize, ManifestExtents)> {
+        which_extent_and_index(self.iter(), coord)
+    }
+
+    #[cfg(test)]
+    pub fn which_index(&self, coord: &ChunkIndices) -> SessionResult<usize> {
+        which_extent_and_index(self.iter(), coord).map(|(i, _)| i)
     }
 }
 
@@ -234,6 +248,8 @@ pub struct Session {
     snapshot_id: SnapshotId,
     change_set: ChangeSet,
     default_commit_metadata: SnapshotProperties,
+    // This is an optimization so that we needn't figure out the split sizes on every set.
+    splits: HashMap<NodeId, ManifestSplits>,
 }
 
 impl Session {
@@ -255,6 +271,7 @@ impl Session {
             snapshot_id,
             change_set: ChangeSet::default(),
             default_commit_metadata: SnapshotProperties::default(),
+            splits: Default::default(),
         }
     }
 
@@ -269,7 +286,6 @@ impl Session {
         snapshot_id: SnapshotId,
         default_commit_metadata: SnapshotProperties,
     ) -> Self {
-        let splitting = config.manifest().splitting().clone();
         Self {
             config,
             storage_settings: Arc::new(storage_settings),
@@ -278,8 +294,9 @@ impl Session {
             virtual_resolver,
             branch_name: Some(branch_name),
             snapshot_id,
-            change_set: ChangeSet::new(splitting),
+            change_set: Default::default(),
             default_commit_metadata,
+            splits: Default::default(),
         }
     }
 
@@ -419,6 +436,7 @@ impl Session {
         match self.get_node(&path).await {
             Err(SessionError { kind: SessionErrorKind::NodeNotFound { .. }, .. }) => {
                 let id = NodeId::random();
+                self.cache_splits(&id, &path, &shape, &dimension_names);
                 self.change_set.add_array(
                     path,
                     id,
@@ -447,6 +465,7 @@ impl Session {
         user_data: Bytes,
     ) -> SessionResult<()> {
         self.get_array(path).await.map(|node| {
+            self.cache_splits(&node.id, path, &shape, &dimension_names);
             self.change_set.update_array(
                 &node.id,
                 path,
@@ -511,6 +530,34 @@ impl Session {
         self.set_node_chunk_ref(node_snapshot, coord, data).await
     }
 
+    fn cache_splits(
+        &mut self,
+        node_id: &NodeId,
+        path: &Path,
+        shape: &ArrayShape,
+        dimension_names: &Option<Vec<DimensionName>>,
+    ) {
+        let splitting = self.config.manifest().splitting();
+        // Q: What happens if we set a chunk, then change a dimension name (so the split changes)?
+        // A: `insert` below replaces the cached splits; chunks already set are not re-keyed here.
+        //    NOTE(review): array updates also call this, so splits can change — confirm intent.
+        let splits = splitting.get_split_sizes(path, shape, dimension_names);
+        self.splits.insert(node_id.clone(), splits);
+    }
+
+    fn get_splits(
+        &mut self,
+        node_id: &NodeId,
+        path: &Path,
+        shape: &ArrayShape,
+        dimension_names: &Option<Vec<DimensionName>>,
+    ) -> &ManifestSplits {
+        // Compute and cache the splits on first use; one map lookup and no panic path.
+        let splitting = self.config.manifest().splitting();
+        let compute = || splitting.get_split_sizes(path, shape, dimension_names);
+        self.splits.entry(node_id.clone()).or_insert_with(compute)
+    }
+
     // Helper function that accepts a NodeSnapshot instead of a path,
     // this lets us do bulk sets (and deletes) without repeatedly grabbing the node.
     #[instrument(skip(self))]
@@ -520,9 +567,14 @@ impl Session {
         coord: ChunkIndices,
         data: Option<ChunkPayload>,
     ) -> SessionResult<()> {
-        if let NodeData::Array { shape, .. } = node.node_data {
+        if let NodeData::Array { shape, dimension_names, .. } = node.node_data {
             if shape.valid_chunk_coord(&coord) {
-                self.change_set.set_chunk_ref(node.id, coord, data);
+                let splits = self
+                    .get_splits(&node.id, &node.path, &shape, &dimension_names)
+                    // FIXME: this clone is a workaround for two mutable borrows
+                    // on self.change_set
+                    .clone();
+                self.change_set.set_chunk_ref(node.id, coord, data, &splits);
                 Ok(())
             } else {
                 Err(SessionErrorKind::InvalidIndex {
@@ -784,7 +836,7 @@ impl Session {
         let splits = ManifestSplits::from_extents(
             manifests.iter().map(|m| m.extents.clone()).collect(),
         );
-        let index = match splits.which(coords) {
+        let (index, _) = match splits.which_extent_and_index(coords) {
             Ok(index) => index,
             // for an invalid coordinate, we bail.
             // This happens for two cases:
@@ -925,6 +977,7 @@ impl Session {
                     &self.config,
                     message,
                     Some(properties),
+                    &self.splits,
                 )
                 .await
             }
@@ -948,6 +1001,7 @@ impl Session {
                         &self.config,
                         message,
                         Some(properties),
+                        &self.splits,
                     )
                     .await
                 }
@@ -1523,6 +1577,7 @@ struct FlushProcess<'a> {
     change_set: &'a ChangeSet,
     parent_id: &'a SnapshotId,
     config: &'a RepositoryConfig,
+    splits: &'a HashMap<NodeId, ManifestSplits>,
     manifest_refs: HashMap<NodeId, Vec<ManifestRef>>,
     manifest_files: HashSet<ManifestFileInfo>,
 }
@@ -1533,12 +1588,14 @@ impl<'a> FlushProcess<'a> {
         change_set: &'a ChangeSet,
         parent_id: &'a SnapshotId,
         config: &'a RepositoryConfig,
+        splits: &'a HashMap<NodeId, ManifestSplits>,
     ) -> Self {
         Self {
             asset_manager,
             change_set,
             parent_id,
             config,
+            splits,
             manifest_refs: Default::default(),
             manifest_files: Default::default(),
         }
@@ -1605,28 +1662,28 @@ impl<'a> FlushProcess<'a> {
     async fn write_manifest_for_new_node(
         &mut self,
         node_id: &NodeId,
-        node_path: &Path,
     ) -> SessionResult<()> {
-        let splits = self.change_set.splits(node_id).expect(&format!("logic bug, array at {} was added in this changeset, splits should be populated.", node_path));
+        let splits = self.splits.get(node_id).expect(&format!(
+            "getting split for node {} unexpectedly failed",
+            node_id.clone()
+        ));
+
         let mut refs =
             HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
 
         // TODO: this could be try_fold with the refs HashMap as state
         for extent in splits.iter() {
-            let cs_extents = self
-                .change_set
-                .array_manifest(node_id, extent)
-                .expect("logic bug. there should be a manifest for this extent ")
-                .extents();
-
-            let chunks = stream::iter(
-                self.change_set
-                    .new_array_manifest_chunks_iterator(node_id, extent)
-                    .map(Ok),
-            );
-            self.write_manifest_from_iterator(chunks, cs_extents)
-                .await?
-                .map(|new_ref| refs.insert(extent.clone(), new_ref));
+            if let Some(manifest) = self.change_set.array_manifest(node_id, extent) {
+                let cs_extents = manifest.extents();
+                let chunks = stream::iter(
+                    self.change_set
+                        .new_array_manifest_chunks_iterator(node_id, extent)
+                        .map(Ok),
+                );
+                self.write_manifest_from_iterator(chunks, cs_extents)
+                    .await?
+                    .map(|new_ref| refs.insert(extent.clone(), new_ref));
+            }
         }
         self.finalize_refs(node_id, refs)
     }
@@ -1885,7 +1942,7 @@ async fn flush(
 
     for (node_path, node_id) in flush_data.change_set.new_arrays() {
         trace!(path=%node_path, "New node, writing a manifest");
-        flush_data.write_manifest_for_new_node(node_id, node_path).await?;
+        flush_data.write_manifest_for_new_node(node_id).await?;
     }
 
     trace!("Building new snapshot");
@@ -2001,11 +2058,13 @@ async fn do_commit(
     config: &RepositoryConfig,
     message: &str,
     properties: Option<SnapshotProperties>,
+    splits: &HashMap<NodeId, ManifestSplits>,
 ) -> SessionResult<SnapshotId> {
     info!(branch_name, old_snapshot_id=%snapshot_id, "Commit started");
     let parent_snapshot = snapshot_id.clone();
     let properties = properties.unwrap_or_default();
-    let flush_data = FlushProcess::new(asset_manager, change_set, snapshot_id, config);
+    let flush_data =
+        FlushProcess::new(asset_manager, change_set, snapshot_id, config, splits);
     let new_snapshot = flush(flush_data, message, properties).await?;
 
     debug!(branch_name, new_snapshot_id=%new_snapshot, "Updating branch");
@@ -2382,16 +2441,16 @@ mod tests {
     async fn test_which_split() -> Result<(), Box<dyn Error>> {
         let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20]]);
 
-        assert_eq!(splits.which(&ChunkIndices(vec![1])).unwrap(), 0);
-        assert_eq!(splits.which(&ChunkIndices(vec![11])).unwrap(), 1);
+        assert_eq!(splits.which_index(&ChunkIndices(vec![1])).unwrap(), 0);
+        assert_eq!(splits.which_index(&ChunkIndices(vec![11])).unwrap(), 1);
 
         let edges = vec![vec![0, 10, 20], vec![0, 10, 20]];
 
         let splits = ManifestSplits::from_edges(edges);
-        assert_eq!(splits.which(&ChunkIndices(vec![1, 1])).unwrap(), 0);
-        assert_eq!(splits.which(&ChunkIndices(vec![1, 10])).unwrap(), 1);
-        assert_eq!(splits.which(&ChunkIndices(vec![1, 11])).unwrap(), 1);
-        assert!(splits.which(&ChunkIndices(vec![21, 21])).is_err());
+        assert_eq!(splits.which_index(&ChunkIndices(vec![1, 1])).unwrap(), 0);
+        assert_eq!(splits.which_index(&ChunkIndices(vec![1, 10])).unwrap(), 1);
+        assert_eq!(splits.which_index(&ChunkIndices(vec![1, 11])).unwrap(), 1);
+        assert!(splits.which_index(&ChunkIndices(vec![21, 21])).is_err());
 
         Ok(())
     }
@@ -3180,8 +3239,10 @@ mod tests {
         ds.add_array(a2path.clone(), shape.clone(), dimension_names.clone(), def.clone())
             .await?;
 
+        dbg!("added arrays, now commit");
         let _ = ds.commit("first commit", None).await?;
 
+        dbg!("committed arrays");
         // there should be no manifests yet because we didn't add any chunks
         assert_eq!(
             0,
@@ -3206,6 +3267,7 @@ mod tests {
 
         let mut ds = repo.writable_session("main").await?;
 
+        dbg!("setting chunk ref");
         // add 3 chunks
         ds.set_chunk_ref(
             a1path.clone(),

From 09cd44590f265b20431412d3fbc8414e860743f0 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 6 Jun 2025 16:08:50 -0600
Subject: [PATCH 03/43] Revert to aggregate_extents

---
 icechunk/src/change_set.rs |  78 ++++----------------
 icechunk/src/repository.rs |   1 -
 icechunk/src/session.rs    | 147 ++++++++++++++++++++++++++++---------
 3 files changed, 125 insertions(+), 101 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index e48aec6e0..ae5fc45a5 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -23,64 +23,13 @@ pub struct ArrayData {
     pub user_data: Bytes,
 }
 
-#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
-pub struct SplitManifest {
-    from: Vec<u32>,
-    to: Vec<u32>,
-    // It's important we keep these sorted, we use this fact in TransactionLog creation
-    chunks: BTreeMap<ChunkIndices, Option<ChunkPayload>>,
-}
-
-impl SplitManifest {
-    pub fn update(&mut self, coord: ChunkIndices, data: Option<ChunkPayload>) {
-        if self.from.is_empty() {
-            debug_assert!(self.to.is_empty());
-            debug_assert!(self.chunks.is_empty());
-            // important to remember that `to` is not inclusive, so we need +1
-            let mut coord0 = coord.0.clone();
-            self.to.extend(coord0.iter().map(|n| *n + 1));
-            self.from.append(&mut coord0);
-        } else {
-            for (existing, coord0) in self.from.iter_mut().zip(coord.0.iter()) {
-                if coord0 < existing {
-                    *existing = *coord0
-                }
-            }
-            for (existing, coord0) in self.to.iter_mut().zip(coord.0.iter()) {
-                // important to remember that `to` is not inclusive, so we need +1
-                let range_value = coord0 + 1;
-                if range_value > *existing {
-                    *existing = range_value
-                }
-            }
-        }
-        self.chunks.insert(coord, data);
-    }
-
-    pub fn retain(&mut self, predicate: impl Fn(&ChunkIndices) -> bool) {
-        self.chunks.retain(|coord, _| {
-            if !predicate(coord) {
-                // FIXME: handle from, to updating
-                todo!();
-            } else {
-                false
-            }
-        })
-    }
-
-    pub fn extents(&self) -> ManifestExtents {
-        ManifestExtents::new(self.from.as_slice(), self.to.as_slice())
-    }
-}
-
+type SplitManifest = BTreeMap<ChunkIndices, Option<ChunkPayload>>;
 #[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)]
 pub struct ChangeSet {
     new_groups: HashMap<Path, (NodeId, Bytes)>,
     new_arrays: HashMap<Path, (NodeId, ArrayData)>,
     updated_arrays: HashMap<NodeId, ArrayData>,
     updated_groups: HashMap<NodeId, Bytes>,
-    // FIXME: It's important we keep these sorted, we use this fact in TransactionLog creation
-    //        Change HashMap -> BTreeMap, need to check Ord on ManifestExtents
     set_chunks: BTreeMap<NodeId, HashMap<ManifestExtents, SplitManifest>>,
     deleted_groups: HashSet<(Path, NodeId)>,
     deleted_arrays: HashSet<(Path, NodeId)>,
@@ -111,7 +60,7 @@ impl ChangeSet {
         &self,
     ) -> impl Iterator<Item = (&NodeId, impl Iterator<Item = &ChunkIndices>)> {
         self.set_chunks.iter().map(|(node_id, split_map)| {
-            (node_id, split_map.values().flat_map(|x| x.chunks.keys()))
+            (node_id, split_map.values().flat_map(|x| x.keys()))
         })
     }
 
@@ -224,17 +173,18 @@ impl ChangeSet {
         self.set_chunks
             .entry(node_id)
             .and_modify(|h| {
-                h.entry(extent.clone()).or_default().update(coord.clone(), data.clone());
+                h.entry(extent.clone()).or_default().insert(coord.clone(), data.clone());
             })
             .or_insert_with(|| {
-                let mut h = HashMap::<ManifestExtents, SplitManifest>::with_capacity(
-                    splits.len(),
-                );
+                let mut h = HashMap::<
+                    ManifestExtents,
+                    BTreeMap<ChunkIndices, Option<ChunkPayload>>,
+                >::with_capacity(splits.len());
                 h.entry(extent.clone())
                     // TODO: this is duplicative. I can't use `or_default` because it's
                     // nice to create the HashMap using `with_capacity`
                     .or_default()
-                    .update(coord, data);
+                    .insert(coord, data);
                 h
             });
     }
@@ -247,9 +197,7 @@ impl ChangeSet {
         if let Some(node_chunks) = self.set_chunks.get(node_id) {
             which_extent_and_index(node_chunks.keys(), coords)
                 .ok()
-                .map(|(_, extent)| {
-                    node_chunks.get(&extent).and_then(|s| s.chunks.get(coords))
-                })
+                .map(|(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords)))
                 .flatten()
         } else {
             None
@@ -261,11 +209,11 @@ impl ChangeSet {
     pub fn drop_chunk_changes(
         &mut self,
         node_id: &NodeId,
-        predicate: impl Fn(&ChunkIndices) -> bool + Copy,
+        predicate: impl Fn(&ChunkIndices) -> bool,
     ) {
         if let Some(changes) = self.set_chunks.get_mut(node_id) {
             for split in changes.values_mut() {
-                split.retain(predicate);
+                split.retain(|coord, _| !predicate(coord));
             }
         }
     }
@@ -287,7 +235,7 @@ impl ChangeSet {
                     .filter(move |(manifest_extent, _)| {
                         extent.is_none() || Some(*manifest_extent) == extent.as_ref()
                     })
-                    .flat_map(|(_, manifest)| manifest.chunks.iter()),
+                    .flat_map(|(_, manifest)| manifest.iter()),
             ),
         }
     }
@@ -345,7 +293,7 @@ impl ChangeSet {
         extent: &ManifestExtents,
     ) -> impl Iterator<Item = ChunkInfo> + use<'a> {
         if let Some(manifest) = self.array_manifest(node_id, extent) {
-            Either::Right(manifest.chunks.iter().filter_map(move |(coords, payload)| {
+            Either::Right(manifest.iter().filter_map(move |(coords, payload)| {
                 payload.as_ref().map(|p| ChunkInfo {
                     node: node_id.clone(),
                     coord: coords.clone(),
diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 54668c50c..d778dc9cf 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -1488,7 +1488,6 @@ mod tests {
 
             add += (axis_size as u32).div_ceil(expected_split_sizes[ax]) as usize
                 - 1 * ((ax > 0) as usize);
-            dbg!(&ax, &add);
             total_manifests += add;
             session.commit(format!("finished axis {0}", ax).as_ref(), None).await?;
             assert_manifest_count(&backend, total_manifests).await;
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index e1650a74b..2ed81e661 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1605,7 +1605,6 @@ impl<'a> FlushProcess<'a> {
         &mut self,
         node: &NodeSnapshot,
         extent: &ManifestExtents,
-        actual_extents: ManifestExtents,
     ) -> SessionResult<Option<ManifestRef>> {
         let asset_manager = Arc::clone(&self.asset_manager);
         let updated_chunks = updated_node_chunks_iterator(
@@ -1617,14 +1616,17 @@ impl<'a> FlushProcess<'a> {
         )
         .await
         .map_ok(|(_path, chunk_info)| chunk_info);
-        self.write_manifest_from_iterator(updated_chunks, actual_extents).await
+        self.write_manifest_from_iterator(updated_chunks).await
     }
 
     async fn write_manifest_from_iterator(
         &mut self,
         chunks: impl Stream<Item = SessionResult<ChunkInfo>>,
-        actual_extents: ManifestExtents,
     ) -> SessionResult<Option<ManifestRef>> {
+        let mut from = vec![];
+        let mut to = vec![];
+        let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord);
+
         if let Some(new_manifest) = Manifest::from_stream(chunks).await.unwrap() {
             let new_manifest = Arc::new(new_manifest);
             let new_manifest_size =
@@ -1636,7 +1638,7 @@ impl<'a> FlushProcess<'a> {
 
             let new_ref = ManifestRef {
                 object_id: new_manifest.id().clone(),
-                extents: actual_extents,
+                extents: ManifestExtents::new(&from, &to),
             };
             Ok(Some(new_ref))
         } else {
@@ -1673,14 +1675,13 @@ impl<'a> FlushProcess<'a> {
 
         // TODO: this could be try_fold with the refs HashMap as state
         for extent in splits.iter() {
-            if let Some(manifest) = self.change_set.array_manifest(node_id, extent) {
-                let cs_extents = manifest.extents();
+            if self.change_set.array_manifest(node_id, extent).is_some() {
                 let chunks = stream::iter(
                     self.change_set
                         .new_array_manifest_chunks_iterator(node_id, extent)
                         .map(Ok),
                 );
-                self.write_manifest_from_iterator(chunks, cs_extents)
+                self.write_manifest_from_iterator(chunks)
                     .await?
                     .map(|new_ref| refs.insert(extent.clone(), new_ref));
             }
@@ -1716,28 +1717,17 @@ impl<'a> FlushProcess<'a> {
 
         // TODO: this should be try_fold with the refs HashMap as state
         for extent in splits.iter() {
-            let on_disk_bbox = on_disk_extents
-                .iter()
-                .filter_map(|e| e.intersection(extent))
-                .reduce(|a, b| a.union(&b));
-
             if modified_splits.contains(extent) {
                 // this split was modified in this session, rewrite it completely
-                let cs_extents = self
-                    .change_set
-                    .array_manifest(&node.id, extent)
-                    .expect("logic bug. there should be a manifest for this extent ")
-                    .extents();
-                let actual_extents =
-                    // if there are splits on disk that overlap, then we take that Extents
-                    // and union it with the Extents of the chunks the changeset
-                    on_disk_bbox.map(|x| cs_extents.union(&x))
-                    // if no overlap, then just use the changeset Extents
-                    .unwrap_or(cs_extents);
-                self.write_manifest_for_updated_chunks(&node, extent, actual_extents)
+                self.write_manifest_for_updated_chunks(&node, extent)
                     .await?
                     .map(|new_ref| refs.insert(extent.clone(), new_ref));
             } else {
+                let on_disk_bbox = on_disk_extents
+                    .iter()
+                    .filter_map(|e| e.intersection(extent))
+                    .reduce(|a, b| a.union(&b));
+
                 // split was unmodified in this session. Let's look at the current manifests
                 // and see what we need to do with them
                 for old_ref in manifests.iter() {
@@ -1753,13 +1743,9 @@ impl<'a> FlushProcess<'a> {
                             // the splits have changed, but no refs in this split have been written in this session
                             // same as `if` block above
                             debug_assert!(on_disk_bbox.is_some());
-                            self.write_manifest_for_updated_chunks(
-                                &node,
-                                extent,
-                                on_disk_bbox.clone().expect("logic bug in writing manifests from disk for partially overlapping split"),
-                            )
-                            .await?
-                            .map(|new_ref| refs.insert(extent.clone(), new_ref));
+                            self.write_manifest_for_updated_chunks(&node, extent)
+                                .await?
+                                .map(|new_ref| refs.insert(extent.clone(), new_ref));
                         }
                         Overlap::None => {
                             // Nothing to do
@@ -2105,6 +2091,63 @@ async fn fetch_manifest(
     Ok(asset_manager.fetch_manifest(manifest_id, manifest_info.size_bytes).await?)
 }
 
+/// Map the stream to accumulate the extents of the chunks traversed
+///
+/// As we are processing chunks to create a manifest, we need to keep track
+/// of the extents of the manifests. This means, for each coordinate, we need
+/// to record its minimum and maximum values.
+///
+/// This adapter does that without having to traverse the stream twice.
+/// It adapts the stream using [`TryStreamExt::map_ok`] and keeps a running min/max
+/// for each coordinate, so only `Ok` items contribute to the extents.
+///
+/// When the returned stream is fully traversed, the per-coordinate minimum is
+/// available in `from` and the maximum plus one (exclusive bound) in `to`.
+///
+/// If the stream yields no `Ok` item, `from` and `to` are left empty.
+fn aggregate_extents<'a, T: std::fmt::Debug, E>(
+    from: &'a mut Vec<u32>,
+    to: &'a mut Vec<u32>,
+    it: impl Stream<Item = Result<T, E>> + 'a,
+    extract_index: impl for<'b> Fn(&'b T) -> &'b ChunkIndices + 'a,
+) -> impl Stream<Item = Result<T, E>> + 'a {
+    // we initialize the destination with an empty array, because we don't know
+    // the dimensions of the array yet. On the first element we will re-initialize
+    *from = Vec::new();
+    *to = Vec::new();
+    it.map_ok(move |t| {
+        // these are the coordinates for the chunk
+        let idx = extract_index(&t);
+
+        // we need to initialize the mins/maxes the first time
+        // we initialize with the value of the first element
+        // this obviously doesn't work for empty streams
+        // but we never generate manifests for them
+        if from.is_empty() {
+            *from = idx.0.clone();
+            // important to remember that `to` is not inclusive, so we need +1
+            *to = idx.0.iter().map(|n| n + 1).collect();
+        } else {
+            // We need to iterate over coordinates, and update the
+            // minimum and maximum for each if needed
+            for (coord_idx, value) in idx.0.iter().enumerate() {
+                if let Some(from_current) = from.get_mut(coord_idx) {
+                    if value < from_current {
+                        *from_current = *value
+                    }
+                }
+                if let Some(to_current) = to.get_mut(coord_idx) {
+                    let range_value = value + 1;
+                    if range_value > *to_current {
+                        *to_current = range_value
+                    }
+                }
+            }
+        }
+        t
+    })
+}
+
 #[cfg(test)]
 #[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)]
 mod tests {
@@ -2125,8 +2168,8 @@ mod tests {
         repository::VersionInfo,
         storage::new_in_memory_storage,
         strategies::{
-            ShapeDim, empty_writable_session, manifest_extents, node_paths,
-            shapes_and_dims,
+            ShapeDim, chunk_indices, empty_writable_session, manifest_extents,
+            node_paths, shapes_and_dims,
         },
     };
 
@@ -2416,7 +2459,6 @@ mod tests {
         assert_eq!(e1.intersection(&e2), None);
         assert_eq!(e1.union(&e2), union);
 
-
         let e1 = ManifestExtents::new(
             vec![0u32, 1, 2].as_slice(),
             vec![2u32, 4, 6].as_slice(),
@@ -2429,13 +2471,48 @@ mod tests {
         assert_eq!(overlaps(&e2, &e1), Overlap::None);
 
         // this should create non-overlapping extents
-        let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20], vec![0, 1, 2], vec![0, 21, 22]]);
+        let splits = ManifestSplits::from_edges(vec![
+            vec![0, 10, 20],
+            vec![0, 1, 2],
+            vec![0, 21, 22],
+        ]);
         for vec in splits.iter().combinations(2) {
             assert_eq!(overlaps(vec[0], vec[1]), Overlap::None)
         }
 
         Ok(())
     }
+    #[proptest(async = "tokio")]
+    async fn test_aggregate_extents(
+        #[strategy(proptest::collection::vec(chunk_indices(3, 0..1_000_000), 1..50))]
+        indices: Vec<ChunkIndices>,
+    ) {
+        let mut from = vec![];
+        let mut to = vec![];
+
+        let expected_from = vec![
+            indices.iter().map(|i| i.0[0]).min().unwrap(),
+            indices.iter().map(|i| i.0[1]).min().unwrap(),
+            indices.iter().map(|i| i.0[2]).min().unwrap(),
+        ];
+        let expected_to = vec![
+            indices.iter().map(|i| i.0[0]).max().unwrap() + 1,
+            indices.iter().map(|i| i.0[1]).max().unwrap() + 1,
+            indices.iter().map(|i| i.0[2]).max().unwrap() + 1,
+        ];
+
+        let _ = aggregate_extents(
+            &mut from,
+            &mut to,
+            stream::iter(indices.into_iter().map(Ok::<ChunkIndices, Infallible>)),
+            |idx| idx,
+        )
+        .count()
+        .await;
+
+        prop_assert_eq!(from, expected_from);
+        prop_assert_eq!(to, expected_to);
+    }
 
     #[tokio::test]
     async fn test_which_split() -> Result<(), Box<dyn Error>> {

From 8924a39c35de182127b65b08bd782fb616c5ad0b Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 6 Jun 2025 16:37:19 -0600
Subject: [PATCH 04/43] Tests pass!

---
 icechunk/src/change_set.rs |  6 +-----
 icechunk/src/session.rs    | 44 ++++++++++++++++----------------------
 2 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index ae5fc45a5..31af358a6 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -171,7 +171,7 @@ impl ChangeSet {
         // this implementation makes delete idempotent
         // it allows deleting a deleted chunk by repeatedly setting None.
         self.set_chunks
-            .entry(node_id)
+            .entry(node_id.clone())
             .and_modify(|h| {
                 h.entry(extent.clone()).or_default().insert(coord.clone(), data.clone());
             })
@@ -335,9 +335,6 @@ impl ChangeSet {
     /// Results of the merge are applied to `self`. Changes present in `other` take precedence over
     /// `self` changes.
     pub fn merge(&mut self, other: ChangeSet) {
-        // FIXME: what do I do with splitting and splits here.
-        // FIXME: this should detect conflict, for example, if different writers added on the same
-        // path, different objects, or if the same path is added and deleted, etc.
         // TODO: optimize
         self.new_groups.extend(other.new_groups);
         self.new_arrays.extend(other.new_arrays);
@@ -345,7 +342,6 @@ impl ChangeSet {
         self.updated_arrays.extend(other.updated_arrays);
         self.deleted_groups.extend(other.deleted_groups);
         self.deleted_arrays.extend(other.deleted_arrays);
-        // FIXME: handle splits
 
         for (node, other_chunks) in other.set_chunks.into_iter() {
             match self.set_chunks.remove(&node) {
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 2ed81e661..3b5b9f66d 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -465,7 +465,10 @@ impl Session {
         user_data: Bytes,
     ) -> SessionResult<()> {
         self.get_array(path).await.map(|node| {
-            self.cache_splits(&node.id, path, &shape, &dimension_names);
+            // Q: What happens if we set a chunk, then change a dimension name, so
+            //   that the split changes?
+            // A: We ignore it. Splits are set once for a node in a session, and are never changed.
+            // self.cache_splits(&node.id, path, &shape, &dimension_names);
             self.change_set.update_array(
                 &node.id,
                 path,
@@ -538,9 +541,6 @@ impl Session {
         dimension_names: &Option<Vec<DimensionName>>,
     ) {
         let splitting = self.config.manifest().splitting();
-        // Q: What happens if we set a chunk, then change a dimension name, so
-        //   that the split changes.
-        // A: We ignore it. splits are set once for a node in a session, and are never changed.
         let splits = splitting.get_split_sizes(path, shape, dimension_names);
         self.splits.insert(node_id.clone(), splits);
     }
@@ -974,7 +974,6 @@ impl Session {
                     branch_name,
                     &self.snapshot_id,
                     &self.change_set,
-                    &self.config,
                     message,
                     Some(properties),
                     &self.splits,
@@ -998,7 +997,6 @@ impl Session {
                         branch_name,
                         &self.snapshot_id,
                         &self.change_set,
-                        &self.config,
                         message,
                         Some(properties),
                         &self.splits,
@@ -1291,6 +1289,7 @@ async fn verified_node_chunk_iterator<'a>(
             let new_chunks = change_set
                 .array_chunks_iterator(&node.id, &node.path, extent.clone())
                 .filter_map(move |(idx, payload)| {
+                    dbg!("iterating through ", &idx, &payload);
                     payload.as_ref().map(|payload| {
                         Ok(ChunkInfo {
                             node: node_id_c.clone(),
@@ -1576,7 +1575,6 @@ struct FlushProcess<'a> {
     asset_manager: Arc<AssetManager>,
     change_set: &'a ChangeSet,
     parent_id: &'a SnapshotId,
-    config: &'a RepositoryConfig,
     splits: &'a HashMap<NodeId, ManifestSplits>,
     manifest_refs: HashMap<NodeId, Vec<ManifestRef>>,
     manifest_files: HashSet<ManifestFileInfo>,
@@ -1587,14 +1585,12 @@ impl<'a> FlushProcess<'a> {
         asset_manager: Arc<AssetManager>,
         change_set: &'a ChangeSet,
         parent_id: &'a SnapshotId,
-        config: &'a RepositoryConfig,
         splits: &'a HashMap<NodeId, ManifestSplits>,
     ) -> Self {
         Self {
             asset_manager,
             change_set,
             parent_id,
-            config,
             splits,
             manifest_refs: Default::default(),
             manifest_files: Default::default(),
@@ -1636,6 +1632,7 @@ impl<'a> FlushProcess<'a> {
                 ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size);
             self.manifest_files.insert(file_info);
 
+            dbg!(&from, &to);
             let new_ref = ManifestRef {
                 object_id: new_manifest.id().clone(),
                 extents: ManifestExtents::new(&from, &to),
@@ -1695,10 +1692,13 @@ impl<'a> FlushProcess<'a> {
     async fn write_manifest_for_existing_node(
         &mut self,
         node: &NodeSnapshot,
-        splits: ManifestSplits,
         manifests: Vec<ManifestRef>,
     ) -> SessionResult<()> {
-        // populate with existing refs, if they are compatiblae
+        let splits = self
+            .splits
+            .get(&node.id)
+            .expect(&format!("splits should exist for this node {}", node.id.clone()));
+        // populate with existing refs, if they are compatible
         let mut refs =
             HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
 
@@ -1877,7 +1877,6 @@ async fn flush(
 
     let old_snapshot =
         flush_data.asset_manager.fetch_snapshot(flush_data.parent_id).await?;
-    let splitting_config = flush_data.config.manifest().splitting();
 
     // We first go through all existing nodes to see if we need to rewrite any manifests
 
@@ -1903,17 +1902,8 @@ async fn flush(
                 &node.path,
             )
             .await?;
-            if let NodeData::Array { shape, dimension_names, manifests } =
-                new_node.node_data
-            {
-                let splits = splitting_config.get_split_sizes(
-                    &new_node.path,
-                    &shape,
-                    &dimension_names,
-                );
-                flush_data
-                    .write_manifest_for_existing_node(&node, splits, manifests)
-                    .await?;
+            if let NodeData::Array { manifests, .. } = new_node.node_data {
+                flush_data.write_manifest_for_existing_node(&node, manifests).await?;
             }
         } else {
             trace!(path=%node.path, "Node has no changes, keeping the previous manifest");
@@ -2041,7 +2031,6 @@ async fn do_commit(
     branch_name: &str,
     snapshot_id: &SnapshotId,
     change_set: &ChangeSet,
-    config: &RepositoryConfig,
     message: &str,
     properties: Option<SnapshotProperties>,
     splits: &HashMap<NodeId, ManifestSplits>,
@@ -2049,8 +2038,7 @@ async fn do_commit(
     info!(branch_name, old_snapshot_id=%snapshot_id, "Commit started");
     let parent_snapshot = snapshot_id.clone();
     let properties = properties.unwrap_or_default();
-    let flush_data =
-        FlushProcess::new(asset_manager, change_set, snapshot_id, config, splits);
+    let flush_data = FlushProcess::new(asset_manager, change_set, snapshot_id, splits);
     let new_snapshot = flush(flush_data, message, properties).await?;
 
     debug!(branch_name, new_snapshot_id=%new_snapshot, "Updating branch");
@@ -2119,6 +2107,8 @@ fn aggregate_extents<'a, T: std::fmt::Debug, E>(
         // these are the coordinates for the chunk
         let idx = extract_index(&t);
 
+        dbg!("processing index ", &idx);
+
         // we need to initialize the mins/maxes the first time
         // we initialize with the value of the first element
         // this obviously doesn't work for empty streams
@@ -2979,6 +2969,8 @@ mod tests {
             Some(ChunkPayload::Inline("new chunk".into()))
         );
 
+        dbg!("deleting chunk");
+
         // we delete a chunk
         ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0, 0, 1]), None)
             .await?;

From ca2d1713a92c5134011b9aa07e34fa1e6507e3b8 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 6 Jun 2025 16:43:27 -0600
Subject: [PATCH 05/43] sqw iterator

---
 icechunk/src/change_set.rs | 25 ++++---------------------
 icechunk/src/session.rs    | 11 ++++++++---
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 31af358a6..f6e882e80 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -244,7 +244,8 @@ impl ChangeSet {
         &self,
     ) -> impl Iterator<Item = (Path, ChunkInfo)> + use<'_> {
         self.new_arrays.iter().flat_map(|(path, (node_id, _))| {
-            self.new_array_chunk_iterator(node_id, path).map(|ci| (path.clone(), ci))
+            self.new_array_chunk_iterator(node_id, path, None)
+                .map(|ci| (path.clone(), ci))
         })
     }
 
@@ -252,8 +253,9 @@ impl ChangeSet {
         &'a self,
         node_id: &'a NodeId,
         node_path: &Path,
+        extent: Option<ManifestExtents>,
     ) -> impl Iterator<Item = ChunkInfo> + use<'a> {
-        self.array_chunks_iterator(node_id, node_path, None).filter_map(
+        self.array_chunks_iterator(node_id, node_path, extent).filter_map(
             move |(coords, payload)| {
                 payload.as_ref().map(|p| ChunkInfo {
                     node: node_id.clone(),
@@ -286,25 +288,6 @@ impl ChangeSet {
         self.set_chunks.get(node_id).and_then(|x| x.get(extent))
     }
 
-    /// Iterator over chunks for a new array for a given ManifestExtents
-    pub fn new_array_manifest_chunks_iterator<'a>(
-        &'a self,
-        node_id: &'a NodeId,
-        extent: &ManifestExtents,
-    ) -> impl Iterator<Item = ChunkInfo> + use<'a> {
-        if let Some(manifest) = self.array_manifest(node_id, extent) {
-            Either::Right(manifest.iter().filter_map(move |(coords, payload)| {
-                payload.as_ref().map(|p| ChunkInfo {
-                    node: node_id.clone(),
-                    coord: coords.clone(),
-                    payload: p.clone(),
-                })
-            }))
-        } else {
-            Either::Left(iter::empty())
-        }
-    }
-
     pub fn new_nodes(&self) -> impl Iterator<Item = (&Path, &NodeId)> {
         self.new_groups().chain(self.new_arrays())
     }
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 3b5b9f66d..21afc6b87 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -897,7 +897,7 @@ impl Session {
         let res = try_stream! {
             let new_chunks = stream::iter(
                 self.change_set
-                    .new_array_chunk_iterator(&node.id, array_path)
+                    .new_array_chunk_iterator(&node.id, array_path, None)
                     .map(|chunk_info| Ok::<ChunkIndices, SessionError>(chunk_info.coord)),
             );
 
@@ -1661,6 +1661,7 @@ impl<'a> FlushProcess<'a> {
     async fn write_manifest_for_new_node(
         &mut self,
         node_id: &NodeId,
+        node_path: &Path,
     ) -> SessionResult<()> {
         let splits = self.splits.get(node_id).expect(&format!(
             "getting split for node {} unexpectedly failed",
@@ -1675,7 +1676,11 @@ impl<'a> FlushProcess<'a> {
             if self.change_set.array_manifest(node_id, extent).is_some() {
                 let chunks = stream::iter(
                     self.change_set
-                        .new_array_manifest_chunks_iterator(node_id, extent)
+                        .new_array_chunk_iterator(
+                            node_id,
+                            node_path,
+                            Some(extent.clone()),
+                        )
                         .map(Ok),
                 );
                 self.write_manifest_from_iterator(chunks)
@@ -1918,7 +1923,7 @@ async fn flush(
 
     for (node_path, node_id) in flush_data.change_set.new_arrays() {
         trace!(path=%node_path, "New node, writing a manifest");
-        flush_data.write_manifest_for_new_node(node_id).await?;
+        flush_data.write_manifest_for_new_node(node_id, node_path).await?;
     }
 
     trace!("Building new snapshot");

From 5e843be6fb37d7eb776d5ff359968b143f082e27 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 6 Jun 2025 16:45:58 -0600
Subject: [PATCH 06/43] cleanup

---
 icechunk/src/session.rs | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 21afc6b87..16cd43239 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1289,7 +1289,6 @@ async fn verified_node_chunk_iterator<'a>(
             let new_chunks = change_set
                 .array_chunks_iterator(&node.id, &node.path, extent.clone())
                 .filter_map(move |(idx, payload)| {
-                    dbg!("iterating through ", &idx, &payload);
                     payload.as_ref().map(|payload| {
                         Ok(ChunkInfo {
                             node: node_id_c.clone(),
@@ -1632,7 +1631,6 @@ impl<'a> FlushProcess<'a> {
                 ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size);
             self.manifest_files.insert(file_info);
 
-            dbg!(&from, &to);
             let new_ref = ManifestRef {
                 object_id: new_manifest.id().clone(),
                 extents: ManifestExtents::new(&from, &to),
@@ -2112,8 +2110,6 @@ fn aggregate_extents<'a, T: std::fmt::Debug, E>(
         // these are the coordinates for the chunk
         let idx = extract_index(&t);
 
-        dbg!("processing index ", &idx);
-
         // we need to initialize the mins/maxes the first time
         // we initialize with the value of the first element
         // this obviously doesn't work for empty streams
@@ -2974,8 +2970,6 @@ mod tests {
             Some(ChunkPayload::Inline("new chunk".into()))
         );
 
-        dbg!("deleting chunk");
-
         // we delete a chunk
         ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0, 0, 1]), None)
             .await?;

From 03d62accc31885bae1d1d06f06c41717406d3184 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 6 Jun 2025 17:03:35 -0600
Subject: [PATCH 07/43] lint

---
 icechunk/src/change_set.rs | 10 +++++-----
 icechunk/src/session.rs    | 33 ++++++++++++++++++---------------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index f6e882e80..87c0e124e 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -69,7 +69,7 @@ impl ChangeSet {
     }
 
     pub fn arrays_with_chunk_changes(&self) -> impl Iterator<Item = &NodeId> {
-        self.set_chunks.iter().map(|(node, _)| node)
+        self.set_chunks.keys()
     }
 
     pub fn is_empty(&self) -> bool {
@@ -167,6 +167,7 @@ impl ChangeSet {
         data: Option<ChunkPayload>,
         splits: &ManifestSplits,
     ) {
+        #[allow(clippy::expect_used)]
         let (_, extent) = splits.which_extent_and_index(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
         // this implementation makes delete idempotent
         // it allows deleting a deleted chunk by repeatedly setting None.
@@ -195,10 +196,9 @@ impl ChangeSet {
         coords: &ChunkIndices,
     ) -> Option<&Option<ChunkPayload>> {
         if let Some(node_chunks) = self.set_chunks.get(node_id) {
-            which_extent_and_index(node_chunks.keys(), coords)
-                .ok()
-                .map(|(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords)))
-                .flatten()
+            which_extent_and_index(node_chunks.keys(), coords).ok().and_then(
+                |(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords)),
+            )
         } else {
             None
         }
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 16cd43239..dfa91d662 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -229,11 +229,11 @@ pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap {
     }
 
     if overlaps.iter().all(|x| x == &Overlap::Complete) {
-        return Overlap::Complete;
+        Overlap::Complete
     } else if overlaps.iter().any(|x| x == &Overlap::None) {
-        return Overlap::None;
+        Overlap::None
     } else {
-        return Overlap::Partial;
+        Overlap::Partial
     }
 }
 
@@ -555,6 +555,7 @@ impl Session {
         if !self.splits.contains_key(node_id) {
             self.cache_splits(node_id, path, shape, dimension_names);
         }
+        #[allow(clippy::expect_used)]
         self.splits.get(node_id).expect("should not be possible.")
     }
 
@@ -1303,7 +1304,7 @@ async fn verified_node_chunk_iterator<'a>(
                     futures::stream::iter(manifests)
                         .filter(move |manifest_ref| {
                             futures::future::ready(extent.as_ref().is_none_or(|e| {
-                                overlaps(&manifest_ref.extents, &e) != Overlap::None
+                                overlaps(&manifest_ref.extents, e) != Overlap::None
                             }))
                         })
                         .then(move |manifest_ref| {
@@ -1622,7 +1623,11 @@ impl<'a> FlushProcess<'a> {
         let mut to = vec![];
         let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord);
 
-        if let Some(new_manifest) = Manifest::from_stream(chunks).await.unwrap() {
+        #[allow(clippy::expect_used)]
+        if let Some(new_manifest) = Manifest::from_stream(chunks)
+            .await
+            .expect("failed to create manifest from chunk stream")
+        {
             let new_manifest = Arc::new(new_manifest);
             let new_manifest_size =
                 self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?;
@@ -1661,10 +1666,9 @@ impl<'a> FlushProcess<'a> {
         node_id: &NodeId,
         node_path: &Path,
     ) -> SessionResult<()> {
-        let splits = self.splits.get(node_id).expect(&format!(
-            "getting split for node {} unexpectedly failed",
-            node_id.clone()
-        ));
+        #[allow(clippy::expect_used)]
+        let splits =
+            self.splits.get(node_id).expect("getting split for node unexpectedly failed");
 
         let mut refs =
             HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
@@ -1697,10 +1701,9 @@ impl<'a> FlushProcess<'a> {
         node: &NodeSnapshot,
         manifests: Vec<ManifestRef>,
     ) -> SessionResult<()> {
-        let splits = self
-            .splits
-            .get(&node.id)
-            .expect(&format!("splits should exist for this node {}", node.id.clone()));
+        #[allow(clippy::expect_used)]
+        let splits =
+            self.splits.get(&node.id).expect("splits should exist for this node.");
         // populate with existing refs, if they are compatible
         let mut refs =
             HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
@@ -1722,7 +1725,7 @@ impl<'a> FlushProcess<'a> {
         for extent in splits.iter() {
             if modified_splits.contains(extent) {
                 // this split was modified in this session, rewrite it completely
-                self.write_manifest_for_updated_chunks(&node, extent)
+                self.write_manifest_for_updated_chunks(node, extent)
                     .await?
                     .map(|new_ref| refs.insert(extent.clone(), new_ref));
             } else {
@@ -1746,7 +1749,7 @@ impl<'a> FlushProcess<'a> {
                             // the splits have changed, but no refs in this split have been written in this session
                             // same as `if` block above
                             debug_assert!(on_disk_bbox.is_some());
-                            self.write_manifest_for_updated_chunks(&node, extent)
+                            self.write_manifest_for_updated_chunks(node, extent)
                                 .await?
                                 .map(|new_ref| refs.insert(extent.clone(), new_ref));
                         }

From ef37e5813a40ff41d29fdb1b5dd4301ff93e6253 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 6 Jun 2025 22:41:25 -0600
Subject: [PATCH 08/43] Fix bug

---
 icechunk-python/tests/test_manifest_splitting.py |  2 +-
 icechunk/src/format/mod.rs                       |  2 +-
 icechunk/src/session.rs                          | 14 +++++++++++++-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/icechunk-python/tests/test_manifest_splitting.py b/icechunk-python/tests/test_manifest_splitting.py
index b29fb7b18..479cecd33 100644
--- a/icechunk-python/tests/test_manifest_splitting.py
+++ b/icechunk-python/tests/test_manifest_splitting.py
@@ -102,7 +102,7 @@ def test_manifest_splitting_appends():
         nchunks += math.prod(NEWSHAPE) * 2
         # the lon size goes from 17 -> 19 so one extra manifest,
         # compared to previous writes
-        nmanifests += 7 * 2
+        nmanifests += 2 * 2
 
         assert len(os.listdir(f"{tmpdir}/chunks")) == nchunks
         assert len(os.listdir(f"{tmpdir}/manifests")) == nmanifests
diff --git a/icechunk/src/format/mod.rs b/icechunk/src/format/mod.rs
index 58f1ebc7c..557c0b778 100644
--- a/icechunk/src/format/mod.rs
+++ b/icechunk/src/format/mod.rs
@@ -244,7 +244,7 @@ pub enum IcechunkFormatErrorKind {
     NodeNotFound { path: Path },
     #[error("chunk coordinates not found `{coords:?}`")]
     ChunkCoordinatesNotFound { coords: ChunkIndices },
-    #[error("manifest information cannot be found in snapshot `{manifest_id}`")]
+    #[error("manifest information cannot be found in snapshot for id `{manifest_id}`")]
     ManifestInfoNotFound { manifest_id: ManifestId },
     #[error("invalid magic numbers in file")]
     InvalidMagicNumbers, // TODO: add more info
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index dfa91d662..fe27f1e41 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1700,6 +1700,7 @@ impl<'a> FlushProcess<'a> {
         &mut self,
         node: &NodeSnapshot,
         manifests: Vec<ManifestRef>,
+        old_snapshot: &Snapshot,
     ) -> SessionResult<()> {
         #[allow(clippy::expect_used)]
         let splits =
@@ -1744,6 +1745,11 @@ impl<'a> FlushProcess<'a> {
                             debug_assert!(on_disk_bbox.is_some());
                             // Just propagate this ref again, no rewriting necessary
                             refs.insert(extent.clone(), old_ref.clone());
+                            // OK to `expect` here since this manifest file must exist in the old snapshot
+                            #[allow(clippy::expect_used)]
+                            self.manifest_files.insert(
+                                old_snapshot.manifest_info(&old_ref.object_id).expect("logic bug. creating manifest file info for an existing manifest failed."),
+                            );
                         }
                         Overlap::Partial => {
                             // the splits have changed, but no refs in this split have been written in this session
@@ -1909,7 +1915,13 @@ async fn flush(
             )
             .await?;
             if let NodeData::Array { manifests, .. } = new_node.node_data {
-                flush_data.write_manifest_for_existing_node(&node, manifests).await?;
+                flush_data
+                    .write_manifest_for_existing_node(
+                        &node,
+                        manifests,
+                        old_snapshot.as_ref(),
+                    )
+                    .await?;
             }
         } else {
             trace!(path=%node.path, "Node has no changes, keeping the previous manifest");

From 9fdd0d3ec53e172fc96c175c86b1aba844152863 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 9 Jun 2025 13:37:44 -0600
Subject: [PATCH 09/43] Minor changes

---
 icechunk/proptest-regressions/session.txt |  2 +
 icechunk/src/change_set.rs                |  2 +-
 icechunk/src/conflicts/detector.rs        |  2 +-
 icechunk/src/format/manifest.rs           |  7 +--
 icechunk/src/format/transaction_log.rs    |  2 +-
 icechunk/src/session.rs                   | 56 ++++++++++++++++-------
 icechunk/src/strategies.rs                |  2 +-
 7 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/icechunk/proptest-regressions/session.txt b/icechunk/proptest-regressions/session.txt
index ce82f8b5d..f938988b2 100644
--- a/icechunk/proptest-regressions/session.txt
+++ b/icechunk/proptest-regressions/session.txt
@@ -8,3 +8,5 @@ cc da94eced751096504c0803bed6ad66cde255567c3cf6c0b316cce66c22e3142a # shrinks to
 cc b0a66d6fdd012c51dd804b9f6c58e4403b2dd41f15c3856adc8d90e3d42311fc # shrinks to (initial_state, transitions, seen_counter) = (RepositoryModel { arrays: {}, groups: [] }, [AddArray(Path(Utf8PathBuf { _encoding: "unix", inner: "/" }), ZarrArrayMetadata { shape: [1], data_type: Bool, chunk_shape: ChunkShape([1]), chunk_key_encoding: Slash, fill_value: Bool(false), codecs: [Codec { name: "mycodec", configuration: None }], storage_transformers: Some([StorageTransformer { name: "mytransformer", configuration: None }]), dimension_names: None })], None)
 cc 4f7049d25e420db7b98fcadb0fe6bc7576d3bbc6eb3b971074e6f7257282d040 # shrinks to input = _TestAddDeleteArrayArgs { path: Path(Utf8PathBuf { _encoding: "unix", inner: "/" }), metadata: ZarrArrayMetadata { shape: [1], data_type: Bool, chunk_shape: ChunkShape([1]), chunk_key_encoding: Slash, fill_value: Bool(false), codecs: [Codec { name: "mycodec", configuration: None }], storage_transformers: Some([StorageTransformer { name: "mytransformer", configuration: None }]), dimension_names: None }, session: Session { config: RepositoryConfig { inline_chunk_threshold_bytes: 512, unsafe_overwrite_refs: false, get_partial_values_concurrency: 10, compression: CompressionConfig { algorithm: Zstd, level: 1 }, caching: CachingConfig { snapshots_cache_size: 2, manifests_cache_size: 2, transactions_cache_size: 0, attributes_cache_size: 2, chunks_cache_size: 0 }, storage: None, virtual_chunk_containers: {"file": VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, "tigris": VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, "s3": VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }, "gcs": VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }, "az": VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }} }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n  algorithm: Zstd\n  level: 1\ncaching:\n  snapshots_cache_size: 2\n  manifests_cache_size: 2\n  transactions_cache_size: 0\n  
attributes_cache_size: 2\n  chunks_cache_size: 0\nstorage: null\nvirtual_chunk_containers:\n  file:\n    name: file\n    url_prefix: file\n    store: !LocalFileSystem ''\n  tigris:\n    name: tigris\n    url_prefix: tigris\n    store: !Tigris {}\n  s3:\n    name: s3\n    url_prefix: s3\n    store: !S3\n      region: null\n      endpoint_url: null\n      anonymous: false\n      allow_http: false\n  gcs:\n    name: gcs\n    url_prefix: gcs\n    store: !Gcs {}\n  az:\n    name: az\n    url_prefix: az\n    store: !Azure {}\n", last_modified: 2025-01-08T15:41:23.520617823Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"6EPXF9PX9JPEY8F4Q9KG\"}", last_modified: 2025-01-08T15:41:23.520470841Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/6EPXF9PX9JPEY8F4Q9KG" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic-         \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb4QJ6QBBJ59Y0VSRKHH1GG\xbe2025-01-08T15:41:23.520187204Z\xb6Repository initialized9404Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.520418067Z, attributes: Attributes({Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, asset_resolver: AssetResolver { storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n  algorithm: Zstd\n  level: 1\ncaching:\n  snapshots_cache_size: 2\n  manifests_cache_size: 2\n  transactions_cache_size: 0\n  attributes_cache_size: 2\n  chunks_cache_size: 0\nstorage: 
null\nvirtual_chunk_containers:\n  file:\n    name: file\n    url_prefix: file\n    store: !LocalFileSystem ''\n  tigris:\n    name: tigris\n    url_prefix: tigris\n    store: !Tigris {}\n  s3:\n    name: s3\n    url_prefix: s3\n    store: !S3\n      region: null\n      endpoint_url: null\n      anonymous: false\n      allow_http: false\n  gcs:\n    name: gcs\n    url_prefix: gcs\n    store: !Gcs {}\n  az:\n    name: az\n    url_prefix: az\n    store: !Azure {}\n", last_modified: 2025-01-08T15:41:23.520617823Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"6EPXF9PX9JPEY8F4Q9KG\"}", last_modified: 2025-01-08T15:41:23.520470841Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/6EPXF9PX9JPEY8F4Q9KG" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic-         \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb4QJ6QBBJ59Y0VSRKHH1GG\xbe2025-01-08T15:41:23.520187204Z\xb6Repository initialized9404Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.520418067Z, attributes: Attributes({Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, num_snapshots: 2, num_manifests: 2, num_transactions: 0, num_attributes: 2, num_chunks: 0, snapshot_cache: Cache { .. }, manifest_cache: Cache { .. }, transactions_cache: Cache { .. }, attributes_cache: Cache { .. }, chunk_cache: Cache { .. 
} }, virtual_resolver: VirtualChunkResolver { containers: [VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }, VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }, VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }], credentials: {}, fetchers: RwLock { data: {} } }, branch_name: Some("main"), snapshot_id: 33add7a6dd4cacef21e4ba67, change_set: ChangeSet { new_groups: {}, new_arrays: {}, updated_arrays: {}, updated_attributes: {}, set_chunks: {}, deleted_groups: {}, deleted_arrays: {} } } }
 cc a4bf17e17086f3e723abccc1307f0b489a5e646e899c08e3b483d3befe15cb26 # shrinks to input = _TestAddArrayGroupClashArgs { path: Path(Utf8PathBuf { _encoding: "unix", inner: "/" }), metadata: ZarrArrayMetadata { shape: [1], data_type: Bool, chunk_shape: ChunkShape([1]), chunk_key_encoding: Slash, fill_value: Bool(false), codecs: [Codec { name: "mycodec", configuration: None }], storage_transformers: Some([StorageTransformer { name: "mytransformer", configuration: None }]), dimension_names: None }, session: Session { config: RepositoryConfig { inline_chunk_threshold_bytes: 512, unsafe_overwrite_refs: false, get_partial_values_concurrency: 10, compression: CompressionConfig { algorithm: Zstd, level: 1 }, caching: CachingConfig { snapshots_cache_size: 2, manifests_cache_size: 2, transactions_cache_size: 0, attributes_cache_size: 2, chunks_cache_size: 0 }, storage: None, virtual_chunk_containers: {"tigris": VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, "file": VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, "az": VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }, "s3": VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }, "gcs": VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }} }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n  algorithm: Zstd\n  level: 1\ncaching:\n  snapshots_cache_size: 2\n  manifests_cache_size: 2\n  transactions_cache_size: 
0\n  attributes_cache_size: 2\n  chunks_cache_size: 0\nstorage: null\nvirtual_chunk_containers:\n  tigris:\n    name: tigris\n    url_prefix: tigris\n    store: !Tigris {}\n  file:\n    name: file\n    url_prefix: file\n    store: !LocalFileSystem ''\n  az:\n    name: az\n    url_prefix: az\n    store: !Azure {}\n  s3:\n    name: s3\n    url_prefix: s3\n    store: !S3\n      region: null\n      endpoint_url: null\n      anonymous: false\n      allow_http: false\n  gcs:\n    name: gcs\n    url_prefix: gcs\n    store: !Gcs {}\n", last_modified: 2025-01-08T15:41:23.635133963Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"2T8MSGGT2FVZBS6094K0\"}", last_modified: 2025-01-08T15:41:23.635089347Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/2T8MSGGT2FVZBS6094K0" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic-         \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb47BZ1BYSVSA8MSS4WB5JG\xbe2025-01-08T15:41:23.634873394Z\xb6Repository initialized5576Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.635043198Z, attributes: Attributes({Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, asset_resolver: AssetResolver { storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n  algorithm: Zstd\n  level: 1\ncaching:\n  snapshots_cache_size: 2\n  manifests_cache_size: 2\n  transactions_cache_size: 0\n  attributes_cache_size: 2\n  chunks_cache_size: 
0\nstorage: null\nvirtual_chunk_containers:\n  tigris:\n    name: tigris\n    url_prefix: tigris\n    store: !Tigris {}\n  file:\n    name: file\n    url_prefix: file\n    store: !LocalFileSystem ''\n  az:\n    name: az\n    url_prefix: az\n    store: !Azure {}\n  s3:\n    name: s3\n    url_prefix: s3\n    store: !S3\n      region: null\n      endpoint_url: null\n      anonymous: false\n      allow_http: false\n  gcs:\n    name: gcs\n    url_prefix: gcs\n    store: !Gcs {}\n", last_modified: 2025-01-08T15:41:23.635133963Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"2T8MSGGT2FVZBS6094K0\"}", last_modified: 2025-01-08T15:41:23.635089347Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/2T8MSGGT2FVZBS6094K0" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic-         \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb47BZ1BYSVSA8MSS4WB5JG\xbe2025-01-08T15:41:23.634873394Z\xb6Repository initialized5576Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.635043198Z, attributes: Attributes({Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, num_snapshots: 2, num_manifests: 2, num_transactions: 0, num_attributes: 2, num_chunks: 0, snapshot_cache: Cache { .. }, manifest_cache: Cache { .. }, transactions_cache: Cache { .. }, attributes_cache: Cache { .. }, chunk_cache: Cache { .. 
} }, virtual_resolver: VirtualChunkResolver { containers: [VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }, VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }, VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }], credentials: {}, fetchers: RwLock { data: {} } }, branch_name: Some("main"), snapshot_id: 16914cc21a13f7f5e4c04926, change_set: ChangeSet { new_groups: {}, new_arrays: {}, updated_arrays: {}, updated_attributes: {}, set_chunks: {}, deleted_groups: {}, deleted_arrays: {} } } }
+cc 60be8007bfbb770a1a404577dde928290d9018a6c7ca8240e9f3e73922be174c # shrinks to input = _TestPropertyExtentsSetOpsArgs { e1: ManifestExtents([0..0, 0..0, 0..0, 0..0]), e2: ManifestExtents([0..0, 0..0, 0..0, 0..0]) }
+cc 2ae68e9977c83f4cb719f54d9b37e9e54a660c4b5ec826e1ac34696d4ab0bf4e # shrinks to input = _TestPropertyExtentsSetOpsSameArgs { e: ManifestExtents([0..0, 0..0, 0..0, 0..0]) }
diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 87c0e124e..302d1d7fb 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -56,7 +56,7 @@ impl ChangeSet {
         self.deleted_arrays.contains(path_and_id)
     }
 
-    pub fn chunk_changes(
+    pub fn changed_chunks(
         &self,
     ) -> impl Iterator<Item = (&NodeId, impl Iterator<Item = &ChunkIndices>)> {
         self.set_chunks.iter().map(|(node_id, split_map)| {
diff --git a/icechunk/src/conflicts/detector.rs b/icechunk/src/conflicts/detector.rs
index d2c4cef24..fc3ab7214 100644
--- a/icechunk/src/conflicts/detector.rs
+++ b/icechunk/src/conflicts/detector.rs
@@ -138,7 +138,7 @@ impl ConflictSolver for ConflictDetector {
             });
 
         let chunks_double_updated =
-            current_changes.chunk_changes().filter_map(|(node_id, changes)| {
+            current_changes.changed_chunks().filter_map(|(node_id, changes)| {
                 let previous_changes: HashSet<_> =
                     previous_change.updated_chunks_for(node_id).collect();
                 if previous_changes.is_empty() {
diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index f9e89009d..1b707c592 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -2,6 +2,7 @@ use std::{
     borrow::Cow,
     cmp::{max, min},
     convert::Infallible,
+    iter::zip,
     ops::Range,
     sync::Arc,
 };
@@ -59,16 +60,16 @@ impl ManifestExtents {
 
     pub fn intersection(&self, other: &Self) -> Option<Self> {
         debug_assert_eq!(self.len(), other.len());
-        let ranges = std::iter::zip(self.iter(), other.iter())
+        let ranges = zip(self.iter(), other.iter())
             .map(|(a, b)| max(a.start, b.start)..min(a.end, b.end))
             .collect::<Vec<_>>();
-        if any(ranges.iter(), |r| r.end < r.start) { None } else { Some(Self(ranges)) }
+        if any(ranges.iter(), |r| r.end <= r.start) { None } else { Some(Self(ranges)) }
     }
 
     pub fn union(&self, other: &Self) -> Self {
         debug_assert_eq!(self.len(), other.len());
         Self::from_ranges_iter(
-            std::iter::zip(self.iter(), other.iter())
+            zip(self.iter(), other.iter())
                 .map(|(a, b)| min(a.start, b.start)..max(a.end, b.end)),
         )
     }
diff --git a/icechunk/src/format/transaction_log.rs b/icechunk/src/format/transaction_log.rs
index 06324b833..2ab20d3d1 100644
--- a/icechunk/src/format/transaction_log.rs
+++ b/icechunk/src/format/transaction_log.rs
@@ -42,7 +42,7 @@ impl TransactionLog {
 
         // these come sorted from the change set
         let updated_chunks = cs
-            .chunk_changes()
+            .changed_chunks()
             .map(|(node_id, chunks)| {
                 let node_id = generated::ObjectId8::new(&node_id.0);
                 let node_id = Some(&node_id);
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index fe27f1e41..38404267b 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -214,27 +214,17 @@ pub enum Overlap {
 pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap {
     debug_assert!(us.len() == them.len());
 
-    let mut overlaps = vec![];
+    let mut overlap = Overlap::Complete;
     for (a, b) in zip(us.iter(), them.iter()) {
         debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone());
         debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone());
-
-        if (a.start <= b.start) && (a.end >= b.end) {
-            overlaps.push(Overlap::Complete);
-        } else if (a.end <= b.start) || (a.start >= b.end) {
-            overlaps.push(Overlap::None);
-        } else {
-            overlaps.push(Overlap::Partial)
+        if (a.end <= b.start) || (a.start >= b.end) {
+            return Overlap::None;
+        } else if !((a.start <= b.start) && (a.end >= b.end)) {
+            overlap = Overlap::Partial
         }
     }
-
-    if overlaps.iter().all(|x| x == &Overlap::Complete) {
-        Overlap::Complete
-    } else if overlaps.iter().any(|x| x == &Overlap::None) {
-        Overlap::None
-    } else {
-        Overlap::Partial
-    }
+    overlap
 }
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
@@ -2476,6 +2466,40 @@ mod tests {
         assert_eq!(overlaps(&e1, &e2), Overlap::None);
         assert_eq!(overlaps(&e2, &e1), Overlap::None);
 
+        // asymmetric case
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let e2 = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let union = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let intersection = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        assert_eq!(overlaps(&e1, &e2), Overlap::Complete);
+        assert_eq!(overlaps(&e2, &e1), Overlap::Partial);
+        assert_eq!(e1.union(&e2), union.clone());
+        assert_eq!(e2.union(&e1), union.clone());
+        assert_eq!(e1.intersection(&e2), Some(intersection));
+
+        // empty set
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let e2 = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![2u32, 4, 6].as_slice(),
+        );
+        assert_eq!(e1.intersection(&e2), None);
+
         // this should create non-overlapping extents
         let splits = ManifestSplits::from_edges(vec![
             vec![0, 10, 20],
diff --git a/icechunk/src/strategies.rs b/icechunk/src/strategies.rs
index 350a7f57f..d458a702c 100644
--- a/icechunk/src/strategies.rs
+++ b/icechunk/src/strategies.rs
@@ -101,7 +101,7 @@ pub fn shapes_and_dims(max_ndim: Option<usize>) -> impl Strategy<Value = ShapeDi
 }
 
 pub fn manifest_extents(ndim: usize) -> impl Strategy<Value = ManifestExtents> {
-    (vec(0u32..1000u32, ndim), vec(0u32..1000u32, ndim)).prop_map(|(start, delta)| {
+    (vec(0u32..1000u32, ndim), vec(1u32..1000u32, ndim)).prop_map(|(start, delta)| {
         let stop = std::iter::zip(start.iter(), delta.iter())
             .map(|(s, d)| s + d)
             .collect::<Vec<_>>();

From c93ea844d280df17e2c5d619d3cd88ce857051bb Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 9 Jun 2025 14:23:20 -0600
Subject: [PATCH 10/43] Fix distributed writes.

---
 icechunk-python/src/session.rs |  6 +++---
 icechunk/src/change_set.rs     | 25 +++++++++++++++++--------
 icechunk/src/session.rs        |  8 +++++---
 3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/icechunk-python/src/session.rs b/icechunk-python/src/session.rs
index cf5306dcc..e5606c76a 100644
--- a/icechunk-python/src/session.rs
+++ b/icechunk-python/src/session.rs
@@ -165,14 +165,14 @@ impl PySession {
     pub fn merge(&self, other: &PySession, py: Python<'_>) -> PyResult<()> {
         // This is blocking function, we need to release the Gil
         py.allow_threads(move || {
-            // TODO: Bad clone
-            let changes = other.0.blocking_read().deref().changes().clone();
+            // TODO: bad clone
+            let other = other.0.blocking_read().deref().clone();
 
             pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
                 self.0
                     .write()
                     .await
-                    .merge(changes)
+                    .merge(other)
                     .await
                     .map_err(PyIcechunkStoreError::SessionError)?;
                 Ok(())
diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 302d1d7fb..71baaa6f8 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -1,5 +1,5 @@
 use std::{
-    collections::{BTreeMap, HashMap, HashSet},
+    collections::{BTreeMap, HashMap, HashSet, btree_map, hash_map},
     iter,
 };
 
@@ -326,14 +326,23 @@ impl ChangeSet {
         self.deleted_groups.extend(other.deleted_groups);
         self.deleted_arrays.extend(other.deleted_arrays);
 
-        for (node, other_chunks) in other.set_chunks.into_iter() {
-            match self.set_chunks.remove(&node) {
-                Some(mut old_value) => {
-                    old_value.extend(other_chunks);
-                    self.set_chunks.insert(node, old_value);
+        for (node, other_splits) in other.set_chunks.into_iter() {
+            // this complicated matching code avoids cloning `other.set_chunks`, which could be quite large
+            match self.set_chunks.entry(node) {
+                btree_map::Entry::Occupied(mut entry) => {
+                    for (extent, their_split) in other_splits.into_iter() {
+                        match entry.get_mut().entry(extent) {
+                            hash_map::Entry::Occupied(mut our_split) => {
+                                our_split.get_mut().extend(their_split);
+                            }
+                            hash_map::Entry::Vacant(entry) => {
+                                entry.insert(their_split);
+                            }
+                        }
+                    }
                 }
-                None => {
-                    self.set_chunks.insert(node, other_chunks);
+                btree_map::Entry::Vacant(entry) => {
+                    entry.insert(other_splits);
                 }
             }
         }
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 38404267b..25c2785b1 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -921,12 +921,14 @@ impl Session {
     }
 
     /// Merge a set of `ChangeSet`s into the repository without committing them
-    #[instrument(skip(self, changes))]
-    pub async fn merge(&mut self, changes: ChangeSet) -> SessionResult<()> {
+    #[instrument(skip(self, other))]
+    pub async fn merge(&mut self, other: Session) -> SessionResult<()> {
         if self.read_only() {
             return Err(SessionErrorKind::ReadOnlySession.into());
         }
-        self.change_set.merge(changes);
+        let Session { splits, change_set, .. } = other;
+        self.splits.extend(splits);
+        self.change_set.merge(change_set);
         Ok(())
     }
 

From 46638dee9ddc716036591b63601eeb2317acec3c Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 9 Jun 2025 18:55:45 -0600
Subject: [PATCH 11/43] Handle appends

---
 icechunk/src/change_set.rs | 32 +++++++++++++++++++++++++++++++-
 icechunk/src/session.rs    | 10 ++++++----
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 71baaa6f8..d9130839f 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -103,7 +103,13 @@ impl ChangeSet {
         self.new_arrays.insert(path, (node_id, array_data));
     }
 
-    pub fn update_array(&mut self, node_id: &NodeId, path: &Path, array_data: ArrayData) {
+    pub fn update_array(
+        &mut self,
+        node_id: &NodeId,
+        path: &Path,
+        array_data: ArrayData,
+        new_splits: &ManifestSplits,
+    ) {
         match self.new_arrays.get(path) {
             Some((id, _)) => {
                 debug_assert!(!self.updated_arrays.contains_key(id));
@@ -113,6 +119,30 @@ impl ChangeSet {
                 self.updated_arrays.insert(node_id.clone(), array_data);
             }
         }
+
+        // update existing splits
+        if let Some(manifests) = self.set_chunks.remove(node_id) {
+            let mut new_manifests =
+                HashMap::<ManifestExtents, SplitManifest>::with_capacity(
+                    new_splits.len(),
+                );
+            for (old_extents, chunks) in manifests.into_iter() {
+                // FIXME: these extents had better be compatible
+                // test_cases: increasing size of (multiple) dimensions
+                //             decreasing size of (multiple) dimensions
+                //
+                new_splits
+                    .iter()
+                    .find(|x| {
+                        // case of increased dimension size
+                        old_extents.overlap_with(*x) == Overlap::Complete
+                        // case of decreased dimension size
+                            || (*x).overlap_with(&old_extents) == Overlap::Complete
+                    })
+                    .map(|extents| new_manifests.insert(extents.clone(), chunks));
+            }
+            self.set_chunks.insert(node_id.clone(), new_manifests);
+        }
     }
 
     pub fn update_group(&mut self, node_id: &NodeId, path: &Path, definition: Bytes) {
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 25c2785b1..914330ce1 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -455,14 +455,13 @@ impl Session {
         user_data: Bytes,
     ) -> SessionResult<()> {
         self.get_array(path).await.map(|node| {
-            // Q: What happens if we set a chunk, then change a dimension name, so
-            //   that the split changes.
-            // A: We ignore it. splits are set once for a node in a session, and are never changed.
-            // self.cache_splits(&node.id, path, &shape, &dimension_names);
+            // Re-cache the splits: a new shape (e.g. after a resize) can change them.
+            self.cache_splits(&node.id, path, &shape, &dimension_names);
             self.change_set.update_array(
                 &node.id,
                 path,
                 ArrayData { shape, dimension_names, user_data },
+                self.splits.get(&node.id).expect("getting splits should not fail."),
             )
         })
     }
@@ -530,6 +529,9 @@ impl Session {
         shape: &ArrayShape,
         dimension_names: &Option<Vec<DimensionName>>,
     ) {
+        // FIXME: handle conflicts here
+        // Q: What happens if we set a chunk, then change a dimension name, so
+        //    that the split changes.
         let splitting = self.config.manifest().splitting();
         let splits = splitting.get_split_sizes(path, shape, dimension_names);
         self.splits.insert(node_id.clone(), splits);

From ac1d24b41ccfb1248cae1af05a56a9955ca82e71 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 9 Jun 2025 18:56:48 -0600
Subject: [PATCH 12/43] Refactor overlaps to overlap_with

---
 icechunk/src/change_set.rs      |  2 +-
 icechunk/src/format/manifest.rs | 24 ++++++++++++
 icechunk/src/session.rs         | 69 +++++++++++----------------------
 3 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index d9130839f..e9df5209f 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
 use crate::{
     format::{
         ChunkIndices, NodeId, Path,
-        manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits},
+        manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits, Overlap},
         snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot},
     },
     session::{SessionResult, which_extent_and_index},
diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index 1b707c592..96a4027ae 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -25,6 +25,13 @@ use super::{
     ChunkId, ChunkIndices, ChunkLength, ChunkOffset, IcechunkResult, ManifestId, NodeId,
 };
 
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Overlap {
+    Complete,
+    Partial,
+    None,
+}
+
 #[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize)]
 pub struct ManifestExtents(Vec<Range<u32>>);
 
@@ -73,6 +80,23 @@ impl ManifestExtents {
                 .map(|(a, b)| min(a.start, b.start)..max(a.end, b.end)),
         )
     }
+
+    pub fn overlap_with(&self, other: &Self) -> Overlap {
+        // Important: this is not symmetric. `Complete` means `other` fully contains `self`.
+        debug_assert!(self.len() == other.len());
+
+        let mut overlap = Overlap::Complete;
+        for (a, b) in zip(other.iter(), self.iter()) {
+            debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone());
+            debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone());
+            if (a.end <= b.start) || (a.start >= b.end) {
+                return Overlap::None;
+            } else if !((a.start <= b.start) && (a.end >= b.end)) {
+                overlap = Overlap::Partial
+            }
+        }
+        overlap
+    }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 914330ce1..0700e292f 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -3,7 +3,6 @@ use std::{
     collections::{HashMap, HashSet},
     convert::Infallible,
     future::{Future, ready},
-    iter::zip,
     ops::Range,
     pin::Pin,
     sync::Arc,
@@ -33,8 +32,9 @@ use crate::{
         IcechunkFormatErrorKind, ManifestId, NodeId, ObjectId, Path, SnapshotId,
         manifest::{
             ChunkInfo, ChunkPayload, ChunkRef, Manifest, ManifestExtents, ManifestRef,
-            ManifestSplits, VirtualChunkLocation, VirtualChunkRef, VirtualReferenceError,
-            VirtualReferenceErrorKind, uniform_manifest_split_edges,
+            ManifestSplits, Overlap, VirtualChunkLocation, VirtualChunkRef,
+            VirtualReferenceError, VirtualReferenceErrorKind,
+            uniform_manifest_split_edges,
         },
         snapshot::{
             ArrayShape, DimensionName, ManifestFileInfo, NodeData, NodeSnapshot,
@@ -203,30 +203,6 @@ impl ManifestSplits {
     }
 }
 
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum Overlap {
-    Complete,
-    Partial,
-    None,
-}
-
-/// Important: this is not symmetric.
-pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap {
-    debug_assert!(us.len() == them.len());
-
-    let mut overlap = Overlap::Complete;
-    for (a, b) in zip(us.iter(), them.iter()) {
-        debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone());
-        debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone());
-        if (a.end <= b.start) || (a.start >= b.end) {
-            return Overlap::None;
-        } else if !((a.start <= b.start) && (a.end >= b.end)) {
-            overlap = Overlap::Partial
-        }
-    }
-    overlap
-}
-
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Session {
     config: RepositoryConfig,
@@ -1298,7 +1274,7 @@ async fn verified_node_chunk_iterator<'a>(
                     futures::stream::iter(manifests)
                         .filter(move |manifest_ref| {
                             futures::future::ready(extent.as_ref().is_none_or(|e| {
-                                overlaps(&manifest_ref.extents, e) != Overlap::None
+                                e.overlap_with(&manifest_ref.extents) != Overlap::None
                             }))
                         })
                         .then(move |manifest_ref| {
@@ -1734,7 +1710,7 @@ impl<'a> FlushProcess<'a> {
                 for old_ref in manifests.iter() {
                     // Remember that the extents written to disk are the `from`:`to` ranges
                     // of populated chunks
-                    match overlaps(&old_ref.extents, extent) {
+                    match extent.overlap_with(&old_ref.extents) {
                         Overlap::Complete => {
                             debug_assert!(on_disk_bbox.is_some());
                             // Just propagate this ref again, no rewriting necessary
@@ -2343,7 +2319,7 @@ mod tests {
     ) {
         prop_assert_eq!(e.intersection(&e), Some(e.clone()));
         prop_assert_eq!(e.union(&e), e.clone());
-        prop_assert_eq!(overlaps(&e, &e), Overlap::Complete);
+        prop_assert_eq!(e.overlap_with(&e), Overlap::Complete);
     }
 
     #[proptest]
@@ -2360,17 +2336,17 @@ mod tests {
         prop_assert_eq!(union.intersection(&e2), Some(e2.clone()));
 
         // order is important for the next 2
-        prop_assert_eq!(overlaps(&union, &e1), Overlap::Complete);
-        prop_assert_eq!(overlaps(&union, &e2), Overlap::Complete);
+        prop_assert_eq!(e1.overlap_with(&union), Overlap::Complete);
+        prop_assert_eq!(e2.overlap_with(&union), Overlap::Complete);
 
         if intersection.is_some() {
             let int = intersection.unwrap();
             let expected = if e1 == e1 { Overlap::Complete } else { Overlap::Partial };
-            prop_assert_eq!(overlaps(&e1, &int), expected.clone());
-            prop_assert_eq!(overlaps(&e2, &int), expected);
+            prop_assert_eq!(int.overlap_with(&e1), expected.clone());
+            prop_assert_eq!(int.overlap_with(&e2), expected);
         } else {
-            prop_assert_eq!(overlaps(&e1, &e2), Overlap::None);
-            prop_assert_eq!(overlaps(&e2, &e1), Overlap::None);
+            prop_assert_eq!(e2.overlap_with(&e1), Overlap::None);
+            prop_assert_eq!(e1.overlap_with(&e2), Overlap::None);
         }
     }
 
@@ -2393,7 +2369,7 @@ mod tests {
         if all(delta_left.iter(), |elem| elem == &0i32)
             && all(delta_right.iter(), |elem| elem == &0i32)
         {
-            prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Complete);
+            prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Complete);
         }
 
         let extent2 = ManifestExtents::from_ranges_iter(
@@ -2410,7 +2386,7 @@ mod tests {
             }),
         );
 
-        prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None);
+        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None);
 
         let extent2 = ManifestExtents::from_ranges_iter(
             multizip((
@@ -2425,7 +2401,7 @@ mod tests {
                     ..((extent.end as i32 - width - low) as u32)
             }),
         );
-        prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None);
+        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None);
 
         let extent2 = ManifestExtents::from_ranges_iter(
             multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map(
@@ -2435,7 +2411,7 @@ mod tests {
                 },
             ),
         );
-        prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Partial);
+        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Partial);
     }
 
     #[icechunk_macros::test]
@@ -2455,7 +2431,7 @@ mod tests {
             vec![12u32, 4, 6].as_slice(),
         );
 
-        assert_eq!(overlaps(&e1, &e2), Overlap::None);
+        assert_eq!(e2.overlap_with(&e1), Overlap::None);
         assert_eq!(e1.intersection(&e2), None);
         assert_eq!(e1.union(&e2), union);
 
@@ -2467,8 +2443,8 @@ mod tests {
             vec![2u32, 1, 2].as_slice(),
             vec![42u32, 4, 6].as_slice(),
         );
-        assert_eq!(overlaps(&e1, &e2), Overlap::None);
-        assert_eq!(overlaps(&e2, &e1), Overlap::None);
+        assert_eq!(e2.overlap_with(&e1), Overlap::None);
+        assert_eq!(e1.overlap_with(&e2), Overlap::None);
 
         // asymmetric case
         let e1 = ManifestExtents::new(
@@ -2487,8 +2463,8 @@ mod tests {
             vec![2u32, 1, 2].as_slice(),
             vec![3u32, 4, 6].as_slice(),
         );
-        assert_eq!(overlaps(&e1, &e2), Overlap::Complete);
-        assert_eq!(overlaps(&e2, &e1), Overlap::Partial);
+        assert_eq!(e2.overlap_with(&e1), Overlap::Complete);
+        assert_eq!(e1.overlap_with(&e2), Overlap::Partial);
         assert_eq!(e1.union(&e2), union.clone());
         assert_eq!(e2.union(&e1), union.clone());
         assert_eq!(e1.intersection(&e2), Some(intersection));
@@ -2511,7 +2487,8 @@ mod tests {
             vec![0, 21, 22],
         ]);
         for vec in splits.iter().combinations(2) {
-            assert_eq!(overlaps(vec[0], vec[1]), Overlap::None)
+            assert_eq!(vec[0].overlap_with(vec[1]), Overlap::None);
+            assert_eq!(vec[1].overlap_with(vec[0]), Overlap::None);
         }
 
         Ok(())

From dae0d6005548ea51acfc6eab6ca6168cbc1b4593 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 9 Jun 2025 18:56:48 -0600
Subject: [PATCH 13/43] Refactor overlaps to overlap_with

---
 icechunk/src/change_set.rs | 4 ++--
 icechunk/src/session.rs    | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index e9df5209f..981c77eac 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -135,9 +135,9 @@ impl ChangeSet {
                     .iter()
                     .find(|x| {
                         // case of increased dimension size
-                        old_extents.overlap_with(*x) == Overlap::Complete
+                        old_extents.overlap_with(x) == Overlap::Complete
                         // case of decreased dimension size
-                            || (*x).overlap_with(&old_extents) == Overlap::Complete
+                            || x.overlap_with(&old_extents) == Overlap::Complete
                     })
                     .map(|extents| new_manifests.insert(extents.clone(), chunks));
             }
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 0700e292f..b444e249a 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -437,6 +437,7 @@ impl Session {
                 &node.id,
                 path,
                 ArrayData { shape, dimension_names, user_data },
+                #[allow(clippy::expect_used)]
                 self.splits.get(&node.id).expect("getting splits should not fail."),
             )
         })

From f841e3c026409eb5603e19e575ad163ceab08974 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Tue, 10 Jun 2025 10:32:24 -0600
Subject: [PATCH 14/43] Cleanup

---
 icechunk/src/change_set.rs | 44 ++++++++--------------
 icechunk/src/session.rs    | 77 +++++++++++++++++---------------------
 2 files changed, 51 insertions(+), 70 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 981c77eac..9128b163d 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -13,7 +13,7 @@ use crate::{
         manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits, Overlap},
         snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot},
     },
-    session::{SessionResult, which_extent_and_index},
+    session::{SessionResult, find_coord},
 };
 
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
@@ -198,7 +198,7 @@ impl ChangeSet {
         splits: &ManifestSplits,
     ) {
         #[allow(clippy::expect_used)]
-        let (_, extent) = splits.which_extent_and_index(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
+        let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
         // this implementation makes delete idempotent
         // it allows deleting a deleted chunk by repeatedly setting None.
         self.set_chunks
@@ -225,13 +225,10 @@ impl ChangeSet {
         node_id: &NodeId,
         coords: &ChunkIndices,
     ) -> Option<&Option<ChunkPayload>> {
-        if let Some(node_chunks) = self.set_chunks.get(node_id) {
-            which_extent_and_index(node_chunks.keys(), coords).ok().and_then(
-                |(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords)),
-            )
-        } else {
-            None
-        }
+        self.set_chunks.get(node_id).and_then(|node_chunks| {
+            find_coord(node_chunks.keys(), coords)
+                .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords)))
+        })
     }
 
     /// Drop the updated chunk references for the node.
@@ -330,24 +327,13 @@ impl ChangeSet {
         self.new_arrays.iter().map(|(path, (node_id, _))| (path, node_id))
     }
 
-    // pub fn take_chunks(
-    //     &mut self,
-    // ) -> BTreeMap<NodeId, BTreeMap<ChunkIndices, Option<ChunkPayload>>> {
-    //     take(&mut self.set_chunks)
-    // }
-
-    // pub fn set_chunks(
-    //     &mut self,
-    //     chunks: BTreeMap<NodeId, BTreeMap<ChunkIndices, Option<ChunkPayload>>>,
-    // ) {
-    //     self.set_chunks = chunks
-    // }
-
     /// Merge this ChangeSet with `other`.
     ///
     /// Results of the merge are applied to `self`. Changes present in `other` take precedence over
     /// `self` changes.
     pub fn merge(&mut self, other: ChangeSet) {
+        // FIXME: this should detect conflict, for example, if different writers added on the same
+        // path, different objects, or if the same path is added and deleted, etc.
         // TODO: optimize
         self.new_groups.extend(other.new_groups);
         self.new_arrays.extend(other.new_arrays);
@@ -539,13 +525,13 @@ mod tests {
         );
         assert_eq!(None, change_set.new_arrays_chunk_iterator().next());
 
-        let splits = ManifestSplits::from_edges(vec![vec![0, 10], vec![0, 10]]);
+        let splits1 = ManifestSplits::from_edges(vec![vec![0, 10], vec![0, 10]]);
 
         change_set.set_chunk_ref(
             node_id1.clone(),
             ChunkIndices(vec![0, 1]),
             None,
-            &splits,
+            &splits1,
         );
         assert_eq!(None, change_set.new_arrays_chunk_iterator().next());
 
@@ -553,25 +539,27 @@ mod tests {
             node_id1.clone(),
             ChunkIndices(vec![1, 0]),
             Some(ChunkPayload::Inline("bar1".into())),
-            &splits,
+            &splits1,
         );
         change_set.set_chunk_ref(
             node_id1.clone(),
             ChunkIndices(vec![1, 1]),
             Some(ChunkPayload::Inline("bar2".into())),
-            &splits,
+            &splits1,
         );
+
+        let splits2 = ManifestSplits::from_edges(vec![vec![0, 10]]);
         change_set.set_chunk_ref(
             node_id2.clone(),
             ChunkIndices(vec![0]),
             Some(ChunkPayload::Inline("baz1".into())),
-            &splits,
+            &splits2,
         );
         change_set.set_chunk_ref(
             node_id2.clone(),
             ChunkIndices(vec![1]),
             Some(ChunkPayload::Inline("baz2".into())),
-            &splits,
+            &splits2,
         );
 
         {
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index b444e249a..9af368e40 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -13,7 +13,7 @@ use bytes::Bytes;
 use chrono::{DateTime, Utc};
 use err_into::ErrorInto;
 use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::Either, stream};
-use itertools::{Itertools as _, repeat_n};
+use itertools::{Itertools as _, enumerate, repeat_n};
 use regex::bytes::Regex;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
@@ -164,10 +164,12 @@ pub type SessionResult<T> = Result<T, SessionError>;
 // Returns the index of split_range that includes ChunkIndices
 // This can be used at write time to split manifests based on the config
 // and at read time to choose which manifest to query for chunk payload
-pub fn which_extent_and_index<'a>(
-    iter: impl Iterator<Item = &'a ManifestExtents>,
-    coord: &ChunkIndices,
-) -> SessionResult<(usize, ManifestExtents)> {
+/// Takes an iterator so callers (e.g. `get_chunk_ref`) can search any set of extents.
+/// `ManifestSplits::find` is a thin wrapper around this function.
+pub fn find_coord<'a, I>(mut iter: I, coord: &'a ChunkIndices) -> Option<ManifestExtents>
+where
+    I: Iterator<Item = &'a ManifestExtents>,
+{
     // split_range[i] must bound ChunkIndices
     // 0 <= return value <= split_range.len()
     // it is possible that split_range does not include a coord. say we have 2x2 split grid
@@ -176,30 +178,27 @@ pub fn which_extent_and_index<'a>(
     // Since split_range need not form a regular grid, we must iterate through and find the first result.
     // ManifestExtents in split_range MUST NOT overlap with each other. How do we ensure this?
     // ndim must be the same
-    // debug_assert_eq!(coord.0.len(), split_range[0].len());
-    // FIXME: could optimize for unbounded single manifest
     // Note: I don't think we can distinguish between out of bounds index for the array
     //       and an index that is part of a split that hasn't been written yet.
-    iter.enumerate()
-        .find(|(_, e)| e.contains(coord.0.as_slice()))
-        .map(|(i, e)| (i, e.clone()))
-        .ok_or(
-            SessionErrorKind::InvalidIndexForSplitManifests { coords: coord.clone() }
-                .into(),
-        )
+    iter.find(|e| e.contains(coord.0.as_slice())).cloned()
+}
+
+pub fn position_coord<'a, I>(iter: I, coord: &'a ChunkIndices) -> Option<usize>
+where
+    I: Iterator<Item = &'a ManifestExtents>,
+{
+    enumerate(iter).find(|(_, e)| e.contains(coord.0.as_slice())).map(|x| x.0)
 }
 
 impl ManifestSplits {
-    pub fn which_extent_and_index(
-        &self,
-        coord: &ChunkIndices,
-    ) -> SessionResult<(usize, ManifestExtents)> {
-        which_extent_and_index(self.iter(), coord)
+    pub fn find(&self, coord: &ChunkIndices) -> Option<ManifestExtents> {
+        debug_assert!(self.0.first().is_none_or(|e| e.len() == coord.0.len()));
+        find_coord(self.iter(), coord)
     }
 
-    #[cfg(test)]
-    pub fn which_index(&self, coord: &ChunkIndices) -> SessionResult<usize> {
-        which_extent_and_index(self.iter(), coord).map(|(i, _)| i)
+    pub fn position(&self, coord: &ChunkIndices) -> Option<usize> {
+        debug_assert!(self.0.first().is_none_or(|e| e.len() == coord.0.len()));
+        position_coord(self.iter(), coord)
     }
 }
 
@@ -803,16 +802,14 @@ impl Session {
             // in the changeset, return None to Zarr.
             return Ok(None);
         }
-        let splits = ManifestSplits::from_extents(
-            manifests.iter().map(|m| m.extents.clone()).collect(),
-        );
-        let (index, _) = match splits.which_extent_and_index(coords) {
-            Ok(index) => index,
+
+        let index = match position_coord(manifests.iter().map(|m| &m.extents), coords) {
+            Some(index) => index,
             // for an invalid coordinate, we bail.
             // This happens for two cases:
             // (1) the "coords" is out-of-range for the array shape
             // (2) the "coords" belongs to a shard that hasn't been written yet.
-            Err(_) => return Ok(None),
+            None => return Ok(None),
         };
 
         let manifest = self.fetch_manifest(&manifests[index].object_id).await?;
@@ -2530,16 +2527,16 @@ mod tests {
     async fn test_which_split() -> Result<(), Box<dyn Error>> {
         let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20]]);
 
-        assert_eq!(splits.which_index(&ChunkIndices(vec![1])).unwrap(), 0);
-        assert_eq!(splits.which_index(&ChunkIndices(vec![11])).unwrap(), 1);
+        assert_eq!(splits.position(&ChunkIndices(vec![1])), Some(0));
+        assert_eq!(splits.position(&ChunkIndices(vec![11])), Some(1));
 
         let edges = vec![vec![0, 10, 20], vec![0, 10, 20]];
 
         let splits = ManifestSplits::from_edges(edges);
-        assert_eq!(splits.which_index(&ChunkIndices(vec![1, 1])).unwrap(), 0);
-        assert_eq!(splits.which_index(&ChunkIndices(vec![1, 10])).unwrap(), 1);
-        assert_eq!(splits.which_index(&ChunkIndices(vec![1, 11])).unwrap(), 1);
-        assert!(splits.which_index(&ChunkIndices(vec![21, 21])).is_err());
+        assert_eq!(splits.position(&ChunkIndices(vec![1, 1])), Some(0));
+        assert_eq!(splits.position(&ChunkIndices(vec![1, 10])), Some(1));
+        assert_eq!(splits.position(&ChunkIndices(vec![1, 11])), Some(1));
+        assert!(splits.position(&ChunkIndices(vec![21, 21])).is_none());
 
         Ok(())
     }
@@ -2791,17 +2788,13 @@ mod tests {
         // set old array chunk and check them
         let data = Bytes::copy_from_slice(b"foo".repeat(512).as_slice());
         let payload = ds.get_chunk_writer()(data.clone()).await?;
-        ds.set_chunk_ref(array1_path.clone(), ChunkIndices(vec![0, 0, 0]), Some(payload))
+        ds.set_chunk_ref(array1_path.clone(), ChunkIndices(vec![0]), Some(payload))
             .await?;
 
         let chunk = get_chunk(
-            ds.get_chunk_reader(
-                &array1_path,
-                &ChunkIndices(vec![0, 0, 0]),
-                &ByteRange::ALL,
-            )
-            .await
-            .unwrap(),
+            ds.get_chunk_reader(&array1_path, &ChunkIndices(vec![0]), &ByteRange::ALL)
+                .await
+                .unwrap(),
         )
         .await?;
         assert_eq!(chunk, Some(data));

From 839fdc341c0b14f6513caa4eafeec1fbb51aa4ee Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Tue, 10 Jun 2025 17:25:09 -0600
Subject: [PATCH 15/43] cleanup more

---
 icechunk/src/change_set.rs | 46 +++++++++++++-------------------------
 1 file changed, 15 insertions(+), 31 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 9128b163d..dbbe0b3ae 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -1,5 +1,5 @@
 use std::{
-    collections::{BTreeMap, HashMap, HashSet, btree_map, hash_map},
+    collections::{BTreeMap, HashMap, HashSet},
     iter,
 };
 
@@ -203,21 +203,15 @@ impl ChangeSet {
         // it allows deleting a deleted chunk by repeatedly setting None.
         self.set_chunks
             .entry(node_id.clone())
-            .and_modify(|h| {
-                h.entry(extent.clone()).or_default().insert(coord.clone(), data.clone());
-            })
             .or_insert_with(|| {
-                let mut h = HashMap::<
+                HashMap::<
                     ManifestExtents,
                     BTreeMap<ChunkIndices, Option<ChunkPayload>>,
-                >::with_capacity(splits.len());
-                h.entry(extent.clone())
-                    // TODO: this is duplicative. I can't use `or_default` because it's
-                    // nice to create the HashMap using `with_capacity`
-                    .or_default()
-                    .insert(coord, data);
-                h
-            });
+                >::with_capacity(splits.len())
+            })
+            .entry(extent)
+            .or_default()
+            .insert(coord, data);
     }
 
     pub fn get_chunk_ref(
@@ -342,24 +336,14 @@ impl ChangeSet {
         self.deleted_groups.extend(other.deleted_groups);
         self.deleted_arrays.extend(other.deleted_arrays);
 
-        for (node, other_splits) in other.set_chunks.into_iter() {
-            // this complicated matching code avoids cloning `other.set_chunks`, which could be quite large
-            match self.set_chunks.entry(node) {
-                btree_map::Entry::Occupied(mut entry) => {
-                    for (extent, their_split) in other_splits.into_iter() {
-                        match entry.get_mut().entry(extent) {
-                            hash_map::Entry::Occupied(mut our_split) => {
-                                our_split.get_mut().extend(their_split);
-                            }
-                            hash_map::Entry::Vacant(entry) => {
-                                entry.insert(their_split);
-                            }
-                        }
-                    }
-                }
-                btree_map::Entry::Vacant(entry) => {
-                    entry.insert(other_splits);
-                }
+        for (node, other_splits) in other.set_chunks {
+            let manifests = self.set_chunks.entry(node).or_insert_with(|| {
+                HashMap::<ManifestExtents, SplitManifest>::with_capacity(
+                    other_splits.len(),
+                )
+            });
+            for (extent, their_manifest) in other_splits {
+                manifests.entry(extent).or_default().extend(their_manifest)
             }
         }
     }

From 35934a97b63b7a5f47df34b96638e2c4d28ba071 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Tue, 10 Jun 2025 17:27:26 -0600
Subject: [PATCH 16/43] more tests

---
 .../tests/test_zarr/test_stateful.py          | 23 ++++++--
 icechunk/src/change_set.rs                    | 36 ++++++++-----
 icechunk/src/format/manifest.rs               |  7 ++-
 icechunk/src/session.rs                       | 54 +++++++++++++++++--
 4 files changed, 97 insertions(+), 23 deletions(-)

diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py
index a3477848f..dc63d8422 100644
--- a/icechunk-python/tests/test_zarr/test_stateful.py
+++ b/icechunk-python/tests/test_zarr/test_stateful.py
@@ -14,8 +14,11 @@
 )
 
 import zarr
+import logging
 from icechunk import Repository, in_memory_storage
 from zarr.core.buffer import default_buffer_prototype
+import hypothesis.extra.numpy as npst
+from zarr.storage import LoggingStore
 from zarr.testing.stateful import ZarrHierarchyStateMachine
 from zarr.testing.strategies import (
     node_names,
@@ -44,10 +47,10 @@ def chunk_paths(
 class ModifiedZarrHierarchyStateMachine(ZarrHierarchyStateMachine):
     def __init__(self, repo: Repository) -> None:
         self.repo = repo
-        store = repo.writable_session("main").store
+        store = LoggingStore(repo.writable_session("main").store, log_level=logging.INFO + 1)
         super().__init__(store)
 
-    @precondition(lambda self: self.store.session.has_uncommitted_changes)
+    @precondition(lambda self: self.store._store.session.has_uncommitted_changes)
     @rule(data=st.data())
     def commit_with_check(self, data) -> None:
         note("committing and checking list_prefix")
@@ -57,9 +60,9 @@ def commit_with_check(self, data) -> None:
         get_before = self._sync(self.store.get(path, prototype=PROTOTYPE))
         assert get_before
 
-        self.store.session.commit("foo")
+        self.store._store.session.commit("foo")
 
-        self.store = self.repo.writable_session("main").store
+        self.store = LoggingStore(self.repo.writable_session("main").store, log_level=logging.INFO + 1)
 
         lsafter = sorted(self._sync_iter(self.store.list_prefix("")))
         get_after = self._sync(self.store.get(path, prototype=PROTOTYPE))
@@ -182,6 +185,18 @@ def delete_chunk(self, data) -> None:
         self._sync(self.model.delete(path))
         self._sync(self.store.delete(path))
 
+    @precondition(lambda self: bool(self.all_arrays))
+    @rule(data=st.data())
+    def resize_array(self, data) -> None:
+        array = data.draw(st.sampled_from(sorted(self.all_arrays)))
+        model_array = zarr.open_array(path=array, store=self.model)
+        store_array = zarr.open_array(path=array, store=self.store)
+        ndim = model_array.ndim
+        new_shape = data.draw(npst.array_shapes(max_dims=ndim, min_dims=ndim, min_side=1))
+        note(f"resizing array from {model_array.shape} to {new_shape}")
+        model_array.resize(new_shape)
+        store_array.resize(new_shape)
+
     @precondition(lambda self: bool(self.all_arrays) or bool(self.all_groups))
     @rule(data=st.data())
     def delete_dir(self, data) -> None:
diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index dbbe0b3ae..40ed9facb 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -126,20 +126,28 @@ impl ChangeSet {
                 HashMap::<ManifestExtents, SplitManifest>::with_capacity(
                     new_splits.len(),
                 );
-            for (old_extents, chunks) in manifests.into_iter() {
-                // FIXME: these extents had better be compatible
-                // test_cases: increasing size of (multiple) dimensions
-                //             decreasing size of (multiple) dimensions
-                //
-                new_splits
-                    .iter()
-                    .find(|x| {
-                        // case of increased dimension size
-                        old_extents.overlap_with(x) == Overlap::Complete
-                        // case of decreased dimension size
-                            || x.overlap_with(&old_extents) == Overlap::Complete
-                    })
-                    .map(|extents| new_manifests.insert(extents.clone(), chunks));
+            for (old_extents, mut chunks) in manifests.into_iter() {
+                for new_extents in new_splits.iter() {
+                    if old_extents.overlap_with(new_extents) == Overlap::None {
+                        continue;
+                    }
+
+                    // TODO: replace with `BTreeMap.drain_filter` after it is stable.
+                    let mut extracted =
+                        BTreeMap::<ChunkIndices, Option<ChunkPayload>>::new();
+                    chunks.retain(|coord, payload| {
+                        if new_extents.contains(coord.0.as_slice()) {
+                            extracted.insert(coord.clone(), payload.clone());
+                            false
+                        } else {
+                            true
+                        }
+                    });
+                    new_manifests
+                        .entry(new_extents.clone())
+                        .or_default()
+                        .extend(extracted);
+                }
             }
             self.set_chunks.insert(node_id.clone(), new_manifests);
         }
diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index 96a4027ae..9e0ae5b55 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -83,7 +83,12 @@ impl ManifestExtents {
 
     pub fn overlap_with(&self, other: &Self) -> Overlap {
         // Important: this is not symmetric.
-        debug_assert!(self.len() == other.len());
+        debug_assert!(
+            self.len() == other.len(),
+            "Length mismatch: self = {:?}, other = {:?}",
+            &self,
+            &other
+        );
 
         let mut overlap = Overlap::Complete;
         for (a, b) in zip(other.iter(), self.iter()) {
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 9af368e40..f8a809953 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -2772,13 +2772,13 @@ mod tests {
 
         let array_def3 = Bytes::from_static(br#"{"this":"more arrays"}"#);
         ds.update_array(
-            &array1_path.clone(),
+            &new_array_path.clone(),
             shape3.clone(),
             dimension_names3.clone(),
             array_def3.clone(),
         )
         .await?;
-        let node = ds.get_node(&array1_path).await;
+        let node = ds.get_node(&new_array_path).await;
         if let Ok(NodeSnapshot { node_data: NodeData::Array { shape, .. }, .. }) = &node {
             assert_eq!(shape, &shape3);
         } else {
@@ -2788,11 +2788,57 @@ mod tests {
         // set old array chunk and check them
         let data = Bytes::copy_from_slice(b"foo".repeat(512).as_slice());
         let payload = ds.get_chunk_writer()(data.clone()).await?;
-        ds.set_chunk_ref(array1_path.clone(), ChunkIndices(vec![0]), Some(payload))
+        ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0]), Some(payload))
             .await?;
 
         let chunk = get_chunk(
-            ds.get_chunk_reader(&array1_path, &ChunkIndices(vec![0]), &ByteRange::ALL)
+            ds.get_chunk_reader(&new_array_path, &ChunkIndices(vec![0]), &ByteRange::ALL)
+                .await
+                .unwrap(),
+        )
+        .await?;
+        assert_eq!(chunk, Some(data));
+
+        // reduce size of dimension
+        // update old array zarr metadata and check it
+        let shape4 = ArrayShape::new(vec![(6, 3)]).unwrap();
+        let array_def3 = Bytes::from_static(br#"{"this":"more arrays"}"#);
+        ds.update_array(
+            &new_array_path.clone(),
+            shape4.clone(),
+            dimension_names3.clone(),
+            array_def3.clone(),
+        )
+        .await?;
+        let node = ds.get_node(&new_array_path).await;
+        if let Ok(NodeSnapshot { node_data: NodeData::Array { shape, .. }, .. }) = &node {
+            assert_eq!(shape, &shape4);
+        } else {
+            panic!("Failed to update zarr metadata");
+        }
+
+        // set old array chunk and check them
+        let data = Bytes::copy_from_slice(b"old".repeat(512).as_slice());
+        let payload = ds.get_chunk_writer()(data.clone()).await?;
+        ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0]), Some(payload))
+            .await?;
+        let data = Bytes::copy_from_slice(b"new".repeat(512).as_slice());
+        let payload = ds.get_chunk_writer()(data.clone()).await?;
+        ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![1]), Some(payload))
+            .await?;
+
+        let chunk = get_chunk(
+            ds.get_chunk_reader(&new_array_path, &ChunkIndices(vec![1]), &ByteRange::ALL)
+                .await
+                .unwrap(),
+        )
+        .await?;
+        assert_eq!(chunk, Some(data.clone()));
+
+        ds.commit("commit", Some(SnapshotProperties::default())).await?;
+
+        let chunk = get_chunk(
+            ds.get_chunk_reader(&new_array_path, &ChunkIndices(vec![1]), &ByteRange::ALL)
                 .await
                 .unwrap(),
         )

From 4b3cde397f5ba8e8383642069f1892b85dd8111d Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Tue, 10 Jun 2025 19:26:46 -0600
Subject: [PATCH 17/43] more cleanup

---
 icechunk/src/change_set.rs | 13 ++++++-------
 icechunk/src/session.rs    |  8 ++------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 40ed9facb..7337fcc77 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -136,12 +136,11 @@ impl ChangeSet {
                     let mut extracted =
                         BTreeMap::<ChunkIndices, Option<ChunkPayload>>::new();
                     chunks.retain(|coord, payload| {
-                        if new_extents.contains(coord.0.as_slice()) {
+                        let cond = new_extents.contains(coord.0.as_slice());
+                        if cond {
                             extracted.insert(coord.clone(), payload.clone());
-                            false
-                        } else {
-                            true
                         }
+                        !cond
                     });
                     new_manifests
                         .entry(new_extents.clone())
@@ -295,17 +294,17 @@ impl ChangeSet {
         )
     }
 
-    pub fn array_manifests_iterator(
+    pub fn modified_manifest_extents_iterator(
         &self,
         node_id: &NodeId,
         node_path: &Path,
-    ) -> impl Iterator<Item = (&ManifestExtents, &SplitManifest)> + use<'_> {
+    ) -> impl Iterator<Item = &ManifestExtents> + use<'_> {
         if self.is_deleted(node_path, node_id) {
             return Either::Left(iter::empty());
         }
         match self.set_chunks.get(node_id) {
             None => Either::Left(iter::empty()),
-            Some(h) => Either::Right(h.iter()),
+            Some(h) => Either::Right(h.keys()),
         }
     }
 
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index f8a809953..da2d56e23 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1620,10 +1620,7 @@ impl<'a> FlushProcess<'a> {
         refs: HashMap<ManifestExtents, ManifestRef>,
     ) -> SessionResult<()> {
         for ref_ in refs.into_values() {
-            self.manifest_refs
-                .entry(node_id.clone())
-                .and_modify(|v| v.push(ref_.clone()))
-                .or_insert_with(|| vec![ref_]);
+            self.manifest_refs.entry(node_id.clone()).or_default().push(ref_);
         }
         Ok(())
     }
@@ -1682,8 +1679,7 @@ impl<'a> FlushProcess<'a> {
 
         let modified_splits = self
             .change_set
-            .array_manifests_iterator(&node.id, &node.path)
-            .map(|(extents, _)| extents)
+            .modified_manifest_extents_iterator(&node.id, &node.path)
             .collect::<HashSet<_>>();
 
         // FIXME: there is an invariant here

From b9d3ec825d7e10475f1d1a89e8b81dc9197ab397 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 11 Jun 2025 12:57:55 -0600
Subject: [PATCH 18/43] note

---
 icechunk/src/change_set.rs | 1 +
 icechunk/src/session.rs    | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 7337fcc77..edd8dc9ae 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -69,6 +69,7 @@ impl ChangeSet {
     }
 
     pub fn arrays_with_chunk_changes(&self) -> impl Iterator<Item = &NodeId> {
+        // FIXME: needs test for session with only chunk deletes on existing nodes
         self.set_chunks.keys()
     }
 
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index da2d56e23..a5cbef38a 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1171,6 +1171,10 @@ async fn updated_chunk_iterator<'a>(
     let snapshot = asset_manager.fetch_snapshot(snapshot_id).await?;
     let nodes = futures::stream::iter(snapshot.iter_arc());
     let res = nodes.and_then(move |node| async move {
+        // Note: Confusingly, these NodeSnapshot instances have the metadata stored in the snapshot.
+        // We have not applied any changeset updates. At the moment, the downstream code only
+        // uses node.id so there is no need to update yet.
+
         Ok(updated_node_chunks_iterator(
             asset_manager,
             change_set,

From 75d264a8d250241cf4168a694ecffb87476cb1c2 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 11 Jun 2025 12:58:54 -0600
Subject: [PATCH 19/43] Track _all_ deleted chunks separately

---
 icechunk/src/change_set.rs | 99 +++++++++++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index edd8dc9ae..2fa233f64 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -31,6 +31,9 @@ pub struct ChangeSet {
     updated_arrays: HashMap<NodeId, ArrayData>,
     updated_groups: HashMap<NodeId, Bytes>,
     set_chunks: BTreeMap<NodeId, HashMap<ManifestExtents, SplitManifest>>,
+    // we track deleted chunk indices separately to handle the case when resizing
+    // an array changes the splits, and we might lose record of chunk deletes.
+    deleted_chunks: HashMap<NodeId, HashSet<ChunkIndices>>,
     deleted_groups: HashSet<(Path, NodeId)>,
     deleted_arrays: HashSet<(Path, NodeId)>,
 }
@@ -60,12 +63,25 @@ impl ChangeSet {
         &self,
     ) -> impl Iterator<Item = (&NodeId, impl Iterator<Item = &ChunkIndices>)> {
         self.set_chunks.iter().map(|(node_id, split_map)| {
-            (node_id, split_map.values().flat_map(|x| x.keys()))
+            let deletes = self
+                .deleted_chunks
+                .get(node_id)
+                .map(|x| Either::Right(x.iter()))
+                .or_else(|| Some(Either::Left(iter::empty())))
+                .unwrap();
+            (
+                node_id,
+                Either::<iter::Empty<&ChunkIndices>, _>::Right(
+                    split_map.values().flat_map(|x| x.keys()),
+                )
+                .chain(deletes),
+            )
         })
     }
 
     pub fn has_chunk_changes(&self, node: &NodeId) -> bool {
         self.set_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false)
+            || self.deleted_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false)
     }
 
     pub fn arrays_with_chunk_changes(&self) -> impl Iterator<Item = &NodeId> {
@@ -205,21 +221,43 @@ impl ChangeSet {
         data: Option<ChunkPayload>,
         splits: &ManifestSplits,
     ) {
-        #[allow(clippy::expect_used)]
-        let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
-        // this implementation makes delete idempotent
-        // it allows deleting a deleted chunk by repeatedly setting None.
-        self.set_chunks
-            .entry(node_id.clone())
-            .or_insert_with(|| {
-                HashMap::<
-                    ManifestExtents,
-                    BTreeMap<ChunkIndices, Option<ChunkPayload>>,
-                >::with_capacity(splits.len())
-            })
-            .entry(extent.clone())
-            .or_default()
-            .insert(coord.clone(), data.clone());
+        // deletes must always be recorded, since the chunk might exist in the on-disk manifest
+        let deletes = self.deleted_chunks.entry(node_id.clone()).or_default();
+        let extent = splits.find(&coord);
+        match data {
+            Some(payload) => {
+                deletes.remove(&coord);
+                self.set_chunks
+                    .entry(node_id)
+                    .or_insert_with(|| {
+                        HashMap::<ManifestExtents, SplitManifest>::with_capacity(
+                            splits.len(),
+                        )
+                    })
+                    .entry(extent.expect("grabbing this extent should succeed"))
+                    // at this point, we have valid data so we must always insert the ref
+                    .or_default()
+                    .insert(coord.clone(), Some(payload));
+            }
+            None => {
+                // this implementation makes delete idempotent
+                // it allows deleting a deleted chunk by repeatedly setting None.
+                deletes.insert(coord.clone());
+                if let Some(extent) = extent {
+                    // both uses of `or_default` are a bit too clever.
+                    // I am making sure this extent is present in the map,
+                    // so that `modified_manifest_extents_iterator` picks it up
+                    self.set_chunks
+                        .entry(node_id)
+                        .or_default()
+                        .entry(extent)
+                        .and_modify(|manifest| {
+                            manifest.remove(&coord);
+                        })
+                        .or_default();
+                };
+            }
+        }
     }
 
     pub fn get_chunk_ref(
@@ -227,9 +265,16 @@ impl ChangeSet {
         node_id: &NodeId,
         coords: &ChunkIndices,
     ) -> Option<&Option<ChunkPayload>> {
-        self.set_chunks.get(node_id).and_then(|node_chunks| {
-            find_coord(node_chunks.keys(), coords)
-                .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords)))
+        self.deleted_chunks.get(node_id).and_then(|deletes| {
+            if deletes.contains(coords) {
+                Some(&None)
+            } else {
+                self.set_chunks.get(node_id).and_then(|node_chunks| {
+                    find_coord(node_chunks.keys(), coords).and_then(|extent| {
+                        node_chunks.get(&extent).and_then(|s| s.get(coords))
+                    })
+                })
+            }
         })
     }
 
@@ -258,14 +303,22 @@ impl ChangeSet {
         }
         match self.set_chunks.get(node_id) {
             None => Either::Left(iter::empty()),
-            Some(h) => Either::Right(
-                h.iter()
+            Some(h) => {
+                let set_chunks = h
+                    .iter()
                     // FIXME: review this
                     .filter(move |(manifest_extent, _)| {
                         extent.is_none() || Some(*manifest_extent) == extent.as_ref()
                     })
-                    .flat_map(|(_, manifest)| manifest.iter()),
-            ),
+                    .flat_map(|(_, manifest)| manifest.iter());
+                let deleted_chunks = match self.deleted_chunks.get(node_id) {
+                    Some(deletes) => Either::Right(
+                        deletes.iter().map(|coord| (coord, &None::<ChunkPayload>)),
+                    ),
+                    None => Either::Left(iter::empty()),
+                };
+                Either::Right(set_chunks.chain(deleted_chunks))
+            }
         }
     }
 

From f88c0f5021214b3280c1f1286863914fe0ba903d Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 11 Jun 2025 12:59:30 -0600
Subject: [PATCH 20/43] Revert "Track _all_ deleted chunks separately"

This reverts commit a6b14e76b28d1919e05db9da52c9f69da5d4015c.
---
 icechunk/src/change_set.rs | 99 +++++++++-----------------------------
 1 file changed, 23 insertions(+), 76 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 2fa233f64..edd8dc9ae 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -31,9 +31,6 @@ pub struct ChangeSet {
     updated_arrays: HashMap<NodeId, ArrayData>,
     updated_groups: HashMap<NodeId, Bytes>,
     set_chunks: BTreeMap<NodeId, HashMap<ManifestExtents, SplitManifest>>,
-    // we track deleted chunk indices separately to handle the case when resizing
-    // an array changes the splits, and we might lose record of chunk deletes.
-    deleted_chunks: HashMap<NodeId, HashSet<ChunkIndices>>,
     deleted_groups: HashSet<(Path, NodeId)>,
     deleted_arrays: HashSet<(Path, NodeId)>,
 }
@@ -63,25 +60,12 @@ impl ChangeSet {
         &self,
     ) -> impl Iterator<Item = (&NodeId, impl Iterator<Item = &ChunkIndices>)> {
         self.set_chunks.iter().map(|(node_id, split_map)| {
-            let deletes = self
-                .deleted_chunks
-                .get(node_id)
-                .map(|x| Either::Right(x.iter()))
-                .or_else(|| Some(Either::Left(iter::empty())))
-                .unwrap();
-            (
-                node_id,
-                Either::<iter::Empty<&ChunkIndices>, _>::Right(
-                    split_map.values().flat_map(|x| x.keys()),
-                )
-                .chain(deletes),
-            )
+            (node_id, split_map.values().flat_map(|x| x.keys()))
         })
     }
 
     pub fn has_chunk_changes(&self, node: &NodeId) -> bool {
         self.set_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false)
-            || self.deleted_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false)
     }
 
     pub fn arrays_with_chunk_changes(&self) -> impl Iterator<Item = &NodeId> {
@@ -221,43 +205,21 @@ impl ChangeSet {
         data: Option<ChunkPayload>,
         splits: &ManifestSplits,
     ) {
-        // deletes must always be recorded, since the chunk might exist in the on-disk manifest
-        let deletes = self.deleted_chunks.entry(node_id.clone()).or_default();
-        let extent = splits.find(&coord);
-        match data {
-            Some(payload) => {
-                deletes.remove(&coord);
-                self.set_chunks
-                    .entry(node_id)
-                    .or_insert_with(|| {
-                        HashMap::<ManifestExtents, SplitManifest>::with_capacity(
-                            splits.len(),
-                        )
-                    })
-                    .entry(extent.expect("grabbing this extent should succeed"))
-                    // at this point, we have valid data so we must always insert the ref
-                    .or_default()
-                    .insert(coord.clone(), Some(payload));
-            }
-            None => {
-                // this implementation makes delete idempotent
-                // it allows deleting a deleted chunk by repeatedly setting None.
-                deletes.insert(coord.clone());
-                if let Some(extent) = extent {
-                    // both uses of `or_default` are a bit too clever.
-                    // I am making sure this extent is present in the map,
-                    // so that `modified_manifest_extents_iterator` picks it up
-                    self.set_chunks
-                        .entry(node_id)
-                        .or_default()
-                        .entry(extent)
-                        .and_modify(|manifest| {
-                            manifest.remove(&coord);
-                        })
-                        .or_default();
-                };
-            }
-        }
+        #[allow(clippy::expect_used)]
+        let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest.");
+        // this implementation makes delete idempotent
+        // it allows deleting a deleted chunk by repeatedly setting None.
+        self.set_chunks
+            .entry(node_id.clone())
+            .or_insert_with(|| {
+                HashMap::<
+                    ManifestExtents,
+                    BTreeMap<ChunkIndices, Option<ChunkPayload>>,
+                >::with_capacity(splits.len())
+            })
+            .entry(extent.clone())
+            .or_default()
+            .insert(coord.clone(), data.clone());
     }
 
     pub fn get_chunk_ref(
@@ -265,16 +227,9 @@ impl ChangeSet {
         node_id: &NodeId,
         coords: &ChunkIndices,
     ) -> Option<&Option<ChunkPayload>> {
-        self.deleted_chunks.get(node_id).and_then(|deletes| {
-            if deletes.contains(coords) {
-                Some(&None)
-            } else {
-                self.set_chunks.get(node_id).and_then(|node_chunks| {
-                    find_coord(node_chunks.keys(), coords).and_then(|extent| {
-                        node_chunks.get(&extent).and_then(|s| s.get(coords))
-                    })
-                })
-            }
+        self.set_chunks.get(node_id).and_then(|node_chunks| {
+            find_coord(node_chunks.keys(), coords)
+                .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords)))
         })
     }
 
@@ -303,22 +258,14 @@ impl ChangeSet {
         }
         match self.set_chunks.get(node_id) {
             None => Either::Left(iter::empty()),
-            Some(h) => {
-                let set_chunks = h
-                    .iter()
+            Some(h) => Either::Right(
+                h.iter()
                     // FIXME: review this
                     .filter(move |(manifest_extent, _)| {
                         extent.is_none() || Some(*manifest_extent) == extent.as_ref()
                     })
-                    .flat_map(|(_, manifest)| manifest.iter());
-                let deleted_chunks = match self.deleted_chunks.get(node_id) {
-                    Some(deletes) => Either::Right(
-                        deletes.iter().map(|coord| (coord, &None::<ChunkPayload>)),
-                    ),
-                    None => Either::Left(iter::empty()),
-                };
-                Either::Right(set_chunks.chain(deleted_chunks))
-            }
+                    .flat_map(|(_, manifest)| manifest.iter()),
+            ),
         }
     }
 

From d6d1a9088162fdc27714ec66f830365b05816aeb Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 11 Jun 2025 14:15:49 -0600
Subject: [PATCH 21/43] Track deleted chunks _outside_ array shape only

---
 icechunk/src/change_set.rs |  40 +++++++++++
 icechunk/src/session.rs    | 132 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 169 insertions(+), 3 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index edd8dc9ae..fb4cff0d9 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -31,6 +31,10 @@ pub struct ChangeSet {
     updated_arrays: HashMap<NodeId, ArrayData>,
     updated_groups: HashMap<NodeId, Bytes>,
     set_chunks: BTreeMap<NodeId, HashMap<ManifestExtents, SplitManifest>>,
+    // This map keeps track of any chunk deletes that are
+    // outside the domain of the current array shape. This is needed to handle
+    // the very unlikely case of multiple resizes in the same session.
+    deleted_chunks_outside_bounds: BTreeMap<NodeId, HashSet<ChunkIndices>>,
     deleted_groups: HashSet<(Path, NodeId)>,
     deleted_arrays: HashSet<(Path, NodeId)>,
 }
@@ -123,6 +127,7 @@ impl ChangeSet {
 
         // update existing splits
         if let Some(manifests) = self.set_chunks.remove(node_id) {
+            let mut new_deleted_chunks = HashSet::<ChunkIndices>::new();
             let mut new_manifests =
                 HashMap::<ManifestExtents, SplitManifest>::with_capacity(
                     new_splits.len(),
@@ -148,8 +153,31 @@ impl ChangeSet {
                         .or_default()
                         .extend(extracted);
                 }
+                new_deleted_chunks.extend(
+                    chunks.into_iter().filter_map(|(coord, payload)| {
+                        payload.is_none().then_some(coord)
+                    }),
+                );
             }
+
+            // bring back any previously tracked deletes
+            if let Some(deletes) = self.deleted_chunks_outside_bounds.get(node_id) {
+                for coord in deletes.iter() {
+                    if let Some(extents) = new_splits.find(coord) {
+                        new_manifests
+                            .entry(extents)
+                            .or_default()
+                            .insert(coord.clone(), None);
+                    };
+                }
+            };
             self.set_chunks.insert(node_id.clone(), new_manifests);
+
+            // keep track of any deletes not inserted into set_chunks
+            self.deleted_chunks_outside_bounds
+                .entry(node_id.clone())
+                .or_default()
+                .extend(new_deleted_chunks);
         }
     }
 
@@ -247,6 +275,16 @@ impl ChangeSet {
         }
     }
 
+    pub fn deleted_chunks_iterator(
+        &self,
+        node_id: &NodeId,
+    ) -> impl Iterator<Item = &ChunkIndices> {
+        match self.deleted_chunks_outside_bounds.get(node_id) {
+            Some(deletes) => Either::Right(deletes.iter()),
+            None => Either::Left(iter::empty()),
+        }
+    }
+
     pub fn array_chunks_iterator(
         &self,
         node_id: &NodeId,
@@ -343,6 +381,8 @@ impl ChangeSet {
         self.updated_arrays.extend(other.updated_arrays);
         self.deleted_groups.extend(other.deleted_groups);
         self.deleted_arrays.extend(other.deleted_arrays);
+        // FIXME: do we even test this?
+        self.deleted_chunks_outside_bounds.extend(other.deleted_chunks_outside_bounds);
 
         for (node, other_splits) in other.set_chunks {
             let manifests = self.set_chunks.entry(node).or_insert_with(|| {
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index a5cbef38a..c950e2a9e 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -637,9 +637,12 @@ impl Session {
                 message: "getting chunk reference".to_string(),
             }
             .into()),
-            NodeData::Array { manifests, .. } => {
-                // Note: at this point, coords could be invalid for the array shape
-                //       but we just let that pass.
+            NodeData::Array { shape, manifests, .. } => {
+                if !shape.valid_chunk_coord(coords) {
+                    // this chunk ref cannot exist
+                    return Ok(None);
+                }
+
                 // check the chunks modified in this session first
                 // TODO: I hate rust forces me to clone to search in a hashmap. How to do better?
                 let session_chunk =
@@ -1254,6 +1257,9 @@ async fn verified_node_chunk_iterator<'a>(
                 change_set
                     .array_chunks_iterator(&node.id, &node.path, extent.clone())
                     .map(|(idx, _)| idx)
+                    // by chaining here, we make sure we don't pull from the manifest
+                    // any chunks that were deleted prior to resizing in this session
+                    .chain(change_set.deleted_chunks_iterator(&node.id))
                     .collect(),
             );
 
@@ -2133,6 +2139,7 @@ mod tests {
 
     use crate::{
         ObjectStorage, Repository,
+        config::{ManifestConfig, ManifestSplitCondition},
         conflicts::{
             basic_solver::{BasicConflictSolver, VersionSelection},
             detector::ConflictDetector,
@@ -2587,6 +2594,125 @@ mod tests {
         Ok(())
     }
 
+    #[tokio_test]
+    async fn test_repository_with_splits_and_resizes() -> Result<(), Box<dyn Error>> {
+        let storage: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;
+        // let storage_settings = storage.default_settings();
+        // let asset_manager =
+        //     AssetManager::new_no_cache(Arc::clone(&storage), storage_settings.clone(), 1);
+
+        let split_sizes = Some(vec![(
+            ManifestSplitCondition::PathMatches { regex: r".*".to_string() },
+            vec![ManifestSplitDim {
+                condition: ManifestSplitDimCondition::Any,
+                num_chunks: 2,
+            }],
+        )]);
+
+        let man_config = ManifestConfig {
+            splitting: Some(ManifestSplittingConfig { split_sizes }),
+            ..ManifestConfig::default()
+        };
+
+        let repo = Repository::create(
+            Some(RepositoryConfig {
+                inline_chunk_threshold_bytes: Some(0),
+                manifest: Some(man_config),
+                ..Default::default()
+            }),
+            storage,
+            HashMap::new(),
+        )
+        .await?;
+        let mut session = repo.writable_session("main").await?;
+        session.add_group(Path::root(), Bytes::copy_from_slice(b"")).await?;
+
+        let array_path: Path = "/array".to_string().try_into().unwrap();
+        let shape = ArrayShape::new(vec![(4, 1)]).unwrap();
+        let dimension_names = Some(vec!["t".into()]);
+        let array_def = Bytes::from_static(br#"{"this":"other array"}"#);
+
+        session
+            .add_array(
+                array_path.clone(),
+                shape.clone(),
+                dimension_names.clone(),
+                array_def.clone(),
+            )
+            .await?;
+
+        let bytes = Bytes::copy_from_slice(&42i8.to_be_bytes());
+        for idx in vec![0, 2] {
+            let payload = session.get_chunk_writer()(bytes.clone()).await?;
+            session
+                .set_chunk_ref(array_path.clone(), ChunkIndices(vec![idx]), Some(payload))
+                .await?;
+        }
+        session.commit("None", None).await?;
+
+        let mut session = repo.writable_session("main").await?;
+        // This is how Zarr resizes
+        // first, delete any out of bounds chunks
+        session.set_chunk_ref(array_path.clone(), ChunkIndices(vec![2]), None).await?;
+        // second, update metadata
+        let shape2 = ArrayShape::new(vec![(2, 1)]).unwrap();
+        session
+            .update_array(
+                &array_path,
+                shape2.clone(),
+                dimension_names.clone(),
+                array_def.clone(),
+            )
+            .await?;
+
+        assert!(
+            session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none()
+        );
+
+        // resize back to original shape
+        session
+            .update_array(
+                &array_path,
+                shape.clone(),
+                dimension_names.clone(),
+                array_def.clone(),
+            )
+            .await?;
+
+        // should still be deleted
+        assert!(
+            session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none()
+        );
+
+        // set another chunk in this split
+        let payload = session.get_chunk_writer()(bytes.clone()).await?;
+        session
+            .set_chunk_ref(array_path.clone(), ChunkIndices(vec![3]), Some(payload))
+            .await?;
+        // should still be deleted
+        assert!(
+            session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none()
+        );
+        // new ref should be present
+        assert!(
+            session.get_chunk_ref(&array_path, &ChunkIndices(vec![3])).await?.is_some()
+        );
+
+        // write manifests, check number of references in manifest
+        session.commit("updated", None).await?;
+
+        // should still be deleted
+        assert!(
+            session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none()
+        );
+        // new ref should be present
+        assert!(
+            session.get_chunk_ref(&array_path, &ChunkIndices(vec![3])).await?.is_some()
+        );
+
+        Ok(())
+    }
+
     #[tokio_test]
     async fn test_repository_with_updates() -> Result<(), Box<dyn Error>> {
         let storage: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;

From 9b9712e321b8a4c8ab039cd684f5b40ad504ef6e Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 11 Jun 2025 14:56:26 -0600
Subject: [PATCH 22/43] Update stateful tests

---
 .../tests/test_stateful_repo_ops.py           |  2 +-
 .../tests/test_zarr/test_stateful.py          | 23 +++++++++++++++----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/icechunk-python/tests/test_stateful_repo_ops.py b/icechunk-python/tests/test_stateful_repo_ops.py
index dfdc6a9f9..47dbd4809 100644
--- a/icechunk-python/tests/test_stateful_repo_ops.py
+++ b/icechunk-python/tests/test_stateful_repo_ops.py
@@ -570,7 +570,7 @@ def check_list_prefix_from_root(self) -> None:
             # need to load to dict to compare since ordering of entries might differ
             expected = json.loads(self.model[k].to_bytes())
             value = self.sync_store.get(k, default_buffer_prototype())
-            assert value is not None
+            assert value is not None, k
             actual = json.loads(value.to_bytes())
             actual_fv = actual.pop("fill_value")
             expected_fv = expected.pop("fill_value")
diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py
index dc63d8422..daa6e8555 100644
--- a/icechunk-python/tests/test_zarr/test_stateful.py
+++ b/icechunk-python/tests/test_zarr/test_stateful.py
@@ -1,6 +1,7 @@
 import json
 from typing import Any
 
+import hypothesis.extra.numpy as npst
 import hypothesis.strategies as st
 import numpy as np
 import pytest
@@ -14,13 +15,11 @@
 )
 
 import zarr
-import logging
 from icechunk import Repository, in_memory_storage
 from zarr.core.buffer import default_buffer_prototype
-import hypothesis.extra.numpy as npst
-from zarr.storage import LoggingStore
 from zarr.testing.stateful import ZarrHierarchyStateMachine
 from zarr.testing.strategies import (
+    basic_indices,
     node_names,
     np_array_and_chunks,
     numpy_arrays,
@@ -47,7 +46,7 @@ def chunk_paths(
 class ModifiedZarrHierarchyStateMachine(ZarrHierarchyStateMachine):
     def __init__(self, repo: Repository) -> None:
         self.repo = repo
-        store = LoggingStore(repo.writable_session("main").store, log_level=logging.INFO + 1)
+        store = repo.writable_session("main").store
         super().__init__(store)
 
     @precondition(lambda self: self.store._store.session.has_uncommitted_changes)
@@ -62,7 +61,7 @@ def commit_with_check(self, data) -> None:
 
         self.store._store.session.commit("foo")
 
-        self.store = LoggingStore(self.repo.writable_session("main").store)
+        self.store = self.repo.writable_session("main").store
 
         lsafter = sorted(self._sync_iter(self.store.list_prefix("")))
         get_after = self._sync(self.store.get(path, prototype=PROTOTYPE))
@@ -185,6 +184,20 @@ def delete_chunk(self, data) -> None:
         self._sync(self.model.delete(path))
         self._sync(self.store.delete(path))
 
+    @precondition(lambda self: bool(self.all_arrays))
+    @rule(data=st.data())
+    def overwrite_array_basic_indexing(self, data) -> None:
+        array = data.draw(st.sampled_from(sorted(self.all_arrays)))
+        model_array = zarr.open_array(path=array, store=self.model)
+        store_array = zarr.open_array(path=array, store=self.store)
+        slicer = data.draw(basic_indices(shape=model_array.shape))
+        note(f"overwriting array basic {slicer=}")
+        new_data = data.draw(
+            npst.arrays(shape=model_array[slicer].shape, dtype=model_array.dtype)
+        )
+        model_array[slicer] = new_data
+        store_array[slicer] = new_data
+
     @precondition(lambda self: bool(self.all_arrays))
     @rule(data=st.data())
     def resize_array(self, data) -> None:

From eaee66a9fdd6b479f17a44e5f4dcc3111ffb0e3f Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 11 Jun 2025 16:21:51 -0600
Subject: [PATCH 23/43] Fix bug & update tests.

Closes #604
---
 icechunk/src/change_set.rs |  4 ++
 icechunk/src/repository.rs | 83 +++++++++++++++++++++++++++++++++++---
 icechunk/src/session.rs    |  9 ++---
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index fb4cff0d9..a8061e7dc 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -68,6 +68,10 @@ impl ChangeSet {
         })
     }
 
+    pub fn is_updated_array(&self, node: &NodeId) -> bool {
+        self.updated_arrays.contains_key(node)
+    }
+
     pub fn has_chunk_changes(&self, node: &NodeId) -> bool {
         self.set_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false)
     }
diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index d778dc9cf..8dc2262f6 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -935,6 +935,7 @@ pub async fn raise_if_invalid_snapshot_id(
 mod tests {
     use std::{collections::HashMap, error::Error, path::PathBuf, sync::Arc};
 
+    use icechunk_macros::tokio_test;
     use storage::logging::LoggingStorage;
     use tempfile::TempDir;
 
@@ -1187,6 +1188,80 @@ mod tests {
         Ok(repository)
     }
 
+    #[tokio_test]
+    async fn test_resize_rewrites_manifests() -> Result<(), Box<dyn Error>> {
+        let storage: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;
+        let repo = Repository::create(
+            Some(RepositoryConfig {
+                inline_chunk_threshold_bytes: Some(0),
+                ..Default::default()
+            }),
+            Arc::clone(&storage),
+            HashMap::new(),
+        )
+        .await?;
+        let mut session = repo.writable_session("main").await?;
+        session.add_group(Path::root(), Bytes::copy_from_slice(b"")).await?;
+
+        let array_path: Path = "/array".to_string().try_into().unwrap();
+        let shape = ArrayShape::new(vec![(4, 1)]).unwrap();
+        let dimension_names = Some(vec!["t".into()]);
+        let array_def = Bytes::from_static(br#"{"this":"other array"}"#);
+
+        session
+            .add_array(
+                array_path.clone(),
+                shape.clone(),
+                dimension_names.clone(),
+                array_def.clone(),
+            )
+            .await?;
+
+        let bytes = Bytes::copy_from_slice(&42i8.to_be_bytes());
+        for idx in 0..4 {
+            let payload = session.get_chunk_writer()(bytes.clone()).await?;
+            session
+                .set_chunk_ref(array_path.clone(), ChunkIndices(vec![idx]), Some(payload))
+                .await?;
+        }
+        session.commit("first commit", None).await?;
+        assert_manifest_count(&storage, 1).await;
+
+        // Important we are not issuing any chunk deletes here (which is what Zarr does)
+        // Note we are still rewriting the manifest (manifest count goes from 1 to 2)
+        let mut session = repo.writable_session("main").await?;
+        let shape2 = ArrayShape::new(vec![(2, 1)]).unwrap();
+        session
+            .update_array(
+                &array_path,
+                shape2.clone(),
+                dimension_names.clone(),
+                array_def.clone(),
+            )
+            .await?;
+        session.commit("second commit", None).await?;
+        assert_manifest_count(&storage, 2).await;
+
+        // Now we expand the size, but don't write chunks.
+        // No new manifests need to be written
+        let mut session = repo.writable_session("main").await?;
+        let shape3 = ArrayShape::new(vec![(6, 1)]).unwrap();
+        session
+            .update_array(
+                &array_path,
+                shape3.clone(),
+                dimension_names.clone(),
+                array_def.clone(),
+            )
+            .await?;
+        session.commit("third commit", None).await?;
+        assert_manifest_count(&storage, 2).await;
+
+        // FIXME: add more complex splitting test
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn tests_manifest_splitting_simple() -> Result<(), Box<dyn Error>> {
         let dim_size = 25u32;
@@ -1245,7 +1320,7 @@ mod tests {
             )
             .await?;
         session.commit("last split", None).await?;
-        total_manifests += 2; // FIXME: this should be +1 once writes are optimized
+        total_manifests += 1;
         assert_manifest_count(&storage, total_manifests).await;
 
         // check that reads are optimized; we should only fetch the last split for this query
@@ -1470,7 +1545,6 @@ mod tests {
         let ops = logging.fetch_operations();
         assert!(ops.is_empty());
 
-        let mut add = 0;
         for ax in 0..shape.len() {
             let mut session = repository.writable_session("main").await?;
             let axis_size = shape.get(ax).unwrap().array_length();
@@ -1486,9 +1560,8 @@ mod tests {
                     .await?
             }
 
-            add += (axis_size as u32).div_ceil(expected_split_sizes[ax]) as usize
-                - 1 * ((ax > 0) as usize);
-            total_manifests += add;
+            total_manifests +=
+                (axis_size as u32).div_ceil(expected_split_sizes[ax]) as usize;
             session.commit(format!("finished axis {0}", ax).as_ref(), None).await?;
             assert_manifest_count(&backend, total_manifests).await;
 
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index c950e2a9e..383ed4d9d 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1714,7 +1714,7 @@ impl<'a> FlushProcess<'a> {
                 for old_ref in manifests.iter() {
                     // Remember that the extents written to disk are the `from`:`to` ranges
                     // of populated chunks
-                    match extent.overlap_with(&old_ref.extents) {
+                    match &old_ref.extents.overlap_with(extent) {
                         Overlap::Complete => {
                             debug_assert!(on_disk_bbox.is_some());
                             // Just propagate this ref again, no rewriting necessary
@@ -1877,7 +1877,9 @@ async fn flush(
             continue;
         }
 
-        if flush_data.change_set.has_chunk_changes(node_id) {
+        if flush_data.change_set.is_updated_array(node_id)
+            || flush_data.change_set.has_chunk_changes(node_id)
+        {
             trace!(path=%node.path, "Node has changes, writing a new manifest");
             // Array wasn't deleted and has changes in this session
             // get the new node to handle changes in size, e.g. appends.
@@ -2597,9 +2599,6 @@ mod tests {
     #[tokio_test]
     async fn test_repository_with_splits_and_resizes() -> Result<(), Box<dyn Error>> {
         let storage: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;
-        // let storage_settings = storage.default_settings();
-        // let asset_manager =
-        //     AssetManager::new_no_cache(Arc::clone(&storage), storage_settings.clone(), 1);
 
         let split_sizes = Some(vec![(
             ManifestSplitCondition::PathMatches { regex: r".*".to_string() },

From 5f4e93a08980ca61800eb8cadf355c91eb8f0ceb Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 11 Jun 2025 21:49:48 -0600
Subject: [PATCH 24/43] Stricter GC test

---
 icechunk/tests/test_gc.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/icechunk/tests/test_gc.rs b/icechunk/tests/test_gc.rs
index 7e44f3a4c..82da26a64 100644
--- a/icechunk/tests/test_gc.rs
+++ b/icechunk/tests/test_gc.rs
@@ -108,6 +108,7 @@ pub async fn do_test_gc(
 
     let first_snap_id = ds.commit("first", None).await?;
     assert_eq!(storage.list_chunks(&storage_settings).await?.count().await, 1100);
+    assert_eq!(storage.list_manifests(&storage_settings).await?.count().await, 110);
 
     let mut ds = repo.writable_session("main").await?;
 
@@ -120,6 +121,7 @@ pub async fn do_test_gc(
     }
     let second_snap_id = ds.commit("second", None).await?;
     assert_eq!(storage.list_chunks(&storage_settings).await?.count().await, 1110);
+    assert_eq!(storage.list_manifests(&storage_settings).await?.count().await, 111);
 
     // verify doing gc without dangling objects doesn't change the repo
     let now = Utc::now();
@@ -155,6 +157,7 @@ pub async fn do_test_gc(
 
     // we still have all the chunks
     assert_eq!(storage.list_chunks(&storage_settings).await?.count().await, 1110);
+    assert_eq!(storage.list_manifests(&storage_settings).await?.count().await, 111);
 
     let summary = garbage_collect(
         storage.as_ref(),
@@ -164,7 +167,7 @@ pub async fn do_test_gc(
     )
     .await?;
     assert_eq!(summary.chunks_deleted, 10);
-    assert_eq!(summary.manifests_deleted, 110);
+    assert_eq!(summary.manifests_deleted, 1);
     assert_eq!(summary.snapshots_deleted, 1);
     assert!(summary.bytes_deleted > summary.chunks_deleted);
 

From a15a8acd31bdd97934df6f5a232ee1b631837804 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Thu, 12 Jun 2025 10:03:48 -0600
Subject: [PATCH 25/43] Fix stateful test

---
 icechunk-python/tests/test_zarr/test_stateful.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py
index daa6e8555..17e1c7f85 100644
--- a/icechunk-python/tests/test_zarr/test_stateful.py
+++ b/icechunk-python/tests/test_zarr/test_stateful.py
@@ -49,7 +49,7 @@ def __init__(self, repo: Repository) -> None:
         store = repo.writable_session("main").store
         super().__init__(store)
 
-    @precondition(lambda self: self.store._store.session.has_uncommitted_changes)
+    @precondition(lambda self: self.store.session.has_uncommitted_changes)
     @rule(data=st.data())
     def commit_with_check(self, data) -> None:
         note("committing and checking list_prefix")
@@ -59,7 +59,7 @@ def commit_with_check(self, data) -> None:
         get_before = self._sync(self.store.get(path, prototype=PROTOTYPE))
         assert get_before
 
-        self.store._store.session.commit("foo")
+        self.store.session.commit("foo")
 
         self.store = self.repo.writable_session("main").store
 

From 0fc8c2a7457ca5463bafc4693def2211366edca6 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Thu, 12 Jun 2025 14:58:34 -0600
Subject: [PATCH 26/43] Cleanup

---
 icechunk/src/change_set.rs |  3 +--
 icechunk/src/session.rs    | 10 ++++------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index a8061e7dc..861df90fa 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -302,9 +302,8 @@ impl ChangeSet {
             None => Either::Left(iter::empty()),
             Some(h) => Either::Right(
                 h.iter()
-                    // FIXME: review this
                     .filter(move |(manifest_extent, _)| {
-                        extent.is_none() || Some(*manifest_extent) == extent.as_ref()
+                        extent.as_ref().is_none_or(|e| e == *manifest_extent)
                     })
                     .flat_map(|(_, manifest)| manifest.iter()),
             ),
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 383ed4d9d..d673cb6d3 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -524,7 +524,7 @@ impl Session {
             self.cache_splits(node_id, path, shape, dimension_names);
         }
         #[allow(clippy::expect_used)]
-        self.splits.get(node_id).expect("should not be possible.")
+        self.splits.get(node_id).expect("splits for node should always exist.")
     }
 
     // Helper function that accepts a NodeSnapshot instead of a path,
@@ -540,8 +540,6 @@ impl Session {
             if shape.valid_chunk_coord(&coord) {
                 let splits = self
                     .get_splits(&node.id, &node.path, &shape, &dimension_names)
-                    // FIXME: this clone is a workaround for two mutable borrows
-                    // on self.change_set
                     .clone();
                 self.change_set.set_chunk_ref(node.id, coord, data, &splits);
                 Ok(())
@@ -1877,7 +1875,9 @@ async fn flush(
             continue;
         }
 
-        if flush_data.change_set.is_updated_array(node_id)
+        if
+        // metadata change might have shrunk the array
+        flush_data.change_set.is_updated_array(node_id)
             || flush_data.change_set.has_chunk_changes(node_id)
         {
             trace!(path=%node.path, "Node has changes, writing a new manifest");
@@ -1902,8 +1902,6 @@ async fn flush(
         } else {
             trace!(path=%node.path, "Node has no changes, keeping the previous manifest");
             // Array wasn't deleted but has no changes in this session
-            // FIXME: deal with the case of metadata shrinking an existing array, we should clear
-            // extra chunks that no longer fit in the array
             flush_data.copy_previous_manifest(&node, old_snapshot.as_ref());
         }
     }

From 2a6d65d82ef277c9e6718a25ef7876a303cd39ae Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 13 Jun 2025 12:41:27 -0600
Subject: [PATCH 27/43] More complex test + fixes

---
 icechunk/src/repository.rs | 199 +++++++++++++++++++++++++++++++------
 icechunk/src/session.rs    |  57 ++++++-----
 2 files changed, 203 insertions(+), 53 deletions(-)

diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 8dc2262f6..9896425d8 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -958,18 +958,31 @@ mod tests {
 
     use super::*;
 
+    fn ravel_multi_index(index: &[u32], shape: &[u32]) -> u32 {
+        index
+            .iter()
+            .zip(shape.iter())
+            .rev() // start from the last axis so it gets stride 1 (row-major order)
+            .fold((0, 1), |(acc, stride), (index, size)| {
+                (acc + *index * stride, stride * *size)
+            })
+            .0
+    }
+
     async fn assert_manifest_count(
         storage: &Arc<dyn Storage + Send + Sync>,
         total_manifests: usize,
     ) {
+        let actual = storage
+            .list_manifests(&storage.default_settings())
+            .await
+            .unwrap()
+            .count()
+            .await;
         assert_eq!(
-            storage
-                .list_manifests(&storage.default_settings())
-                .await
-                .unwrap()
-                .count()
-                .await,
-            total_manifests
+            actual, total_manifests,
+            "Mismatch in manifest count: expected {}, but got {}",
+            total_manifests, actual,
         );
     }
 
@@ -1425,7 +1438,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[tokio_test]
     async fn test_manifest_splitting_complex_config() -> Result<(), Box<dyn Error>> {
         let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap();
         let dimension_names = Some(vec!["t".into(), "z".into(), "y".into(), "x".into()]);
@@ -1490,9 +1503,10 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[tokio_test]
     async fn test_manifest_splitting_complex_writes() -> Result<(), Box<dyn Error>> {
         let t_split_size = 12u32;
+        let other_split_size = 9u32;
         let y_split_size = 2u32;
 
         let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap();
@@ -1518,7 +1532,7 @@ mod tests {
                 ManifestSplitCondition::NameMatches { regex: r".*".to_string() },
                 vec![ManifestSplitDim {
                     condition: ManifestSplitDimCondition::Any,
-                    num_chunks: 9,
+                    num_chunks: other_split_size,
                 }],
             ),
         ];
@@ -1537,6 +1551,7 @@ mod tests {
             Some(logging_c),
         )
         .await?;
+        let repo_clone = repository.reopen(None, None)?;
 
         let mut total_manifests = 0;
         assert_manifest_count(&backend, total_manifests).await;
@@ -1545,17 +1560,63 @@ mod tests {
         let ops = logging.fetch_operations();
         assert!(ops.is_empty());
 
+        let array_shape =
+            shape.iter().map(|x| x.array_length() as u32).collect::<Vec<_>>();
+
+        let verify_data = async |ax, session: &Session| {
+            for i in 0..shape.get(ax).unwrap().array_length() {
+                let mut index = vec![0u32, 0, 0, 0];
+                index[ax] = i as u32;
+                let ic = index.clone();
+                let val = get_chunk(
+                    session
+                        .get_chunk_reader(
+                            &temp_path,
+                            &ChunkIndices(index),
+                            &ByteRange::ALL,
+                        )
+                        .await
+                        .unwrap(),
+                )
+                .await
+                .unwrap()
+                .expect(&format!("getting chunk ref failed for {:?}", &ic));
+                let expected_value =
+                    ravel_multi_index(ic.as_slice(), array_shape.as_slice());
+                let expected =
+                    Bytes::copy_from_slice(format!("{0}", expected_value).as_bytes());
+                assert_eq!(
+                    val, expected,
+                    "For chunk {:?}, received {:?}, expected {:?}",
+                    ic, val, expected
+                );
+            }
+        };
+        let verify_all_data = async |repo: &Repository| {
+            let session = repo
+                .readonly_session(&VersionInfo::BranchTipRef("main".to_string()))
+                .await
+                .unwrap();
+            for ax in 0..shape.len() {
+                verify_data(ax, &session).await;
+            }
+        };
+
+        //=========================================================
+        // This loop iterates over axis and rewrites the boundary chunks.
+        // Each loop iteration must rewrite chunk_shape/split_size manifests
         for ax in 0..shape.len() {
             let mut session = repository.writable_session("main").await?;
             let axis_size = shape.get(ax).unwrap().array_length();
             for i in 0..axis_size {
                 let mut index = vec![0u32, 0, 0, 0];
                 index[ax] = i as u32;
+                let value = ravel_multi_index(index.as_slice(), array_shape.as_slice());
                 session
                     .set_chunk_ref(
                         temp_path.clone(),
                         ChunkIndices(index),
-                        Some(ChunkPayload::Inline(format!("{0}", i).into())),
+                        Some(ChunkPayload::Inline(format!("{0}", value).into())),
                     )
                     .await?
             }
@@ -1565,25 +1626,107 @@ mod tests {
             session.commit(format!("finished axis {0}", ax).as_ref(), None).await?;
             assert_manifest_count(&backend, total_manifests).await;
 
-            for i in 0..shape.get(ax).unwrap().array_length() {
-                let mut index = vec![0u32, 0, 0, 0];
-                index[ax] = i as u32;
-                let val = get_chunk(
-                    session
-                        .get_chunk_reader(
-                            &temp_path,
-                            &ChunkIndices(index),
-                            &ByteRange::ALL,
-                        )
-                        .await
-                        .unwrap(),
+            verify_data(ax.clone(), &session).await;
+        }
+        verify_all_data(&repository).await;
+
+        //=========================================================
+        // Now change splitting config
+        let split_sizes = vec![(
+            ManifestSplitCondition::AnyArray,
+            vec![ManifestSplitDim {
+                condition: ManifestSplitDimCondition::DimensionName("t".to_string()),
+                num_chunks: t_split_size,
+            }],
+        )];
+
+        let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) };
+        let man_config = ManifestConfig {
+            preload: Some(ManifestPreloadConfig {
+                max_total_refs: None,
+                preload_if: None,
+            }),
+            splitting: Some(split_config.clone()),
+        };
+        let config = RepositoryConfig {
+            manifest: Some(man_config),
+            ..RepositoryConfig::default()
+        };
+        let repository = repository.reopen(Some(config), None)?;
+        verify_all_data(&repository).await;
+        let mut session = repository.writable_session("main").await?;
+        let index = vec![13, 0, 0, 0];
+        let value = ravel_multi_index(index.as_slice(), array_shape.as_slice());
+        session
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(index),
+                Some(ChunkPayload::Inline(format!("{0}", value).into())),
+            )
+            .await?;
+        // Important: we only create one new manifest in this case for
+        // the first split in the `t`-axis. Since the other splits
+        // are not modified we preserve all the old manifests
+        total_manifests += 1;
+        session.commit(format!("finished time again").as_ref(), None).await?;
+        assert_manifest_count(&backend, total_manifests).await;
+        verify_all_data(&repository).await;
+
+        // now modify all splits to trigger a full rewrite
+        let mut session = repository.writable_session("main").await?;
+        for idx in [0, 12, 24] {
+            let index = vec![idx, 0, 0, 0];
+            let value = ravel_multi_index(index.as_slice(), array_shape.as_slice());
+            session
+                .set_chunk_ref(
+                    temp_path.clone(),
+                    ChunkIndices(index),
+                    Some(ChunkPayload::Inline(format!("{0}", value).into())),
                 )
-                .await
-                .unwrap()
-                .unwrap();
-                assert_eq!(val, Bytes::copy_from_slice(format!("{0}", i).as_bytes()));
-            }
+                .await?;
         }
+        total_manifests +=
+            (shape.get(0).unwrap().array_length() as u32).div_ceil(t_split_size) as usize;
+        session.commit(format!("finished time again").as_ref(), None).await?;
+        assert_manifest_count(&backend, total_manifests).await;
+        verify_all_data(&repository).await;
+
+        //=========================================================
+        // Now get back to original repository with original config
+        // Modify one `t` split.
+        let mut session = repo_clone.writable_session("main").await?;
+        session
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(vec![0, 0, 0, 0]),
+                Some(ChunkPayload::Inline(format!("{0}", 0).into())),
+            )
+            .await?;
+        // Important: now we rewrite one split per dimension
+        total_manifests += 4 - 1;
+        session.commit(format!("finished time again").as_ref(), None).await?;
+        assert_manifest_count(&backend, total_manifests).await;
+        verify_all_data(&repo_clone).await;
+        verify_all_data(&repository).await;
+
+        let mut session = repo_clone.writable_session("main").await?;
+        for idx in [0, 12, 24] {
+            let index = vec![idx, 0, 0, 0];
+            let value = ravel_multi_index(index.as_slice(), array_shape.as_slice());
+            session
+                .set_chunk_ref(
+                    temp_path.clone(),
+                    ChunkIndices(index),
+                    Some(ChunkPayload::Inline(format!("{0}", value).into())),
+                )
+                .await?;
+        }
+        total_manifests +=
+            (shape.get(0).unwrap().array_length() as u32).div_ceil(t_split_size) as usize;
+        session.commit(format!("finished time again").as_ref(), None).await?;
+        assert_manifest_count(&backend, total_manifests).await;
+        verify_all_data(&repo_clone).await;
+
         Ok(())
     }
 
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index d673cb6d3..5ae5b96b7 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1622,16 +1622,6 @@ impl<'a> FlushProcess<'a> {
         }
     }
 
-    fn finalize_refs(
-        &mut self,
-        node_id: &NodeId,
-        refs: HashMap<ManifestExtents, ManifestRef>,
-    ) -> SessionResult<()> {
-        for ref_ in refs.into_values() {
-            self.manifest_refs.entry(node_id.clone()).or_default().push(ref_);
-        }
-        Ok(())
-    }
     /// Write a manifest for a node that was created in this session
     /// It doesn't need to look at previous manifests because the node is new
     async fn write_manifest_for_new_node(
@@ -1643,9 +1633,6 @@ impl<'a> FlushProcess<'a> {
         let splits =
             self.splits.get(node_id).expect("getting split for node unexpectedly failed");
 
-        let mut refs =
-            HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
-
         // TODO: this could be try_fold with the refs HashMap as state
         for extent in splits.iter() {
             if self.change_set.array_manifest(node_id, extent).is_some() {
@@ -1658,12 +1645,14 @@ impl<'a> FlushProcess<'a> {
                         )
                         .map(Ok),
                 );
-                self.write_manifest_from_iterator(chunks)
-                    .await?
-                    .map(|new_ref| refs.insert(extent.clone(), new_ref));
+                #[allow(clippy::expect_used)]
+                let new_ref = self.write_manifest_from_iterator(chunks).await?.expect(
+                    "logic bug. for a new node, we must always write the manifest",
+                );
+                self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref);
             }
         }
-        self.finalize_refs(node_id, refs)
+        Ok(())
     }
 
     /// Write a manifest for a node that was modified in this session
@@ -1678,9 +1667,8 @@ impl<'a> FlushProcess<'a> {
         #[allow(clippy::expect_used)]
         let splits =
             self.splits.get(&node.id).expect("splits should exist for this node.");
-        // populate with existing refs, if they are compatible
         let mut refs =
-            HashMap::<ManifestExtents, ManifestRef>::with_capacity(splits.len());
+            HashMap::<ManifestExtents, Vec<ManifestRef>>::with_capacity(splits.len());
 
         let on_disk_extents =
             manifests.iter().map(|m| m.extents.clone()).collect::<Vec<_>>();
@@ -1700,7 +1688,7 @@ impl<'a> FlushProcess<'a> {
                 // this split was modified in this session, rewrite it completely
                 self.write_manifest_for_updated_chunks(node, extent)
                     .await?
-                    .map(|new_ref| refs.insert(extent.clone(), new_ref));
+                    .map(|new_ref| refs.insert(extent.clone(), vec![new_ref]));
             } else {
                 let on_disk_bbox = on_disk_extents
                     .iter()
@@ -1712,11 +1700,11 @@ impl<'a> FlushProcess<'a> {
                 for old_ref in manifests.iter() {
                     // Remember that the extents written to disk are the `from`:`to` ranges
                     // of populated chunks
-                    match &old_ref.extents.overlap_with(extent) {
+                    match old_ref.extents.overlap_with(extent) {
                         Overlap::Complete => {
                             debug_assert!(on_disk_bbox.is_some());
                             // Just propagate this ref again, no rewriting necessary
-                            refs.insert(extent.clone(), old_ref.clone());
+                            refs.entry(extent.clone()).or_default().push(old_ref.clone());
                             // OK to unwrap here since this manifest file must exist in the old snapshot
                             #[allow(clippy::expect_used)]
                             self.manifest_files.insert(
@@ -1727,9 +1715,12 @@ impl<'a> FlushProcess<'a> {
                             // the splits have changed, but no refs in this split have been written in this session
                             // same as `if` block above
                             debug_assert!(on_disk_bbox.is_some());
-                            self.write_manifest_for_updated_chunks(node, extent)
+                            if let Some(new_ref) = self
+                                .write_manifest_for_updated_chunks(node, extent)
                                 .await?
-                                .map(|new_ref| refs.insert(extent.clone(), new_ref));
+                            {
+                                refs.entry(extent.clone()).or_default().push(new_ref);
+                            }
                         }
                         Overlap::None => {
                             // Nothing to do
@@ -1739,7 +1730,12 @@ impl<'a> FlushProcess<'a> {
             }
         }
 
-        self.finalize_refs(&node.id, refs)?;
+        // FIXME: Assert that bboxes in refs don't overlap
+
+        self.manifest_refs
+            .entry(node.id.clone())
+            .or_default()
+            .extend(refs.into_values().flatten());
         Ok(())
     }
 
@@ -1913,6 +1909,17 @@ async fn flush(
         flush_data.write_manifest_for_new_node(node_id, node_path).await?;
     }
 
+    // manifest_files & manifest_refs _must_ be consistent
+    debug_assert_eq!(
+        flush_data.manifest_files.iter().map(|x| x.id.clone()).collect::<HashSet<_>>(),
+        flush_data
+            .manifest_refs
+            .values()
+            .flatten()
+            .map(|x| x.object_id.clone())
+            .collect::<HashSet<_>>(),
+    );
+
     trace!("Building new snapshot");
     // gather and sort nodes:
     // this is a requirement for Snapshot::from_iter

From 27ad4bc4e8008b6243c9ed603cfbf0b8d93a0c81 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 13 Jun 2025 15:09:30 -0600
Subject: [PATCH 28/43] more test

---
 icechunk/src/repository.rs | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 9896425d8..4b43455fd 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -1703,7 +1703,7 @@ mod tests {
             )
             .await?;
         // Important: now we rewrite one split per dimension
-        total_manifests += 4 - 1;
+        total_manifests += 3;
         session.commit(format!("finished time again").as_ref(), None).await?;
         assert_manifest_count(&backend, total_manifests).await;
         verify_all_data(&repo_clone).await;
@@ -1727,6 +1727,39 @@ mod tests {
         assert_manifest_count(&backend, total_manifests).await;
         verify_all_data(&repo_clone).await;
 
+        // do that again, but with different values and test those specifically
+        let mut session = repo_clone.writable_session("main").await?;
+        for idx in [0, 12, 24] {
+            let index = vec![idx, 0, 0, 0];
+            session
+                .set_chunk_ref(
+                    temp_path.clone(),
+                    ChunkIndices(index),
+                    Some(ChunkPayload::Inline(format!("{0}", idx + 2).into())),
+                )
+                .await?;
+        }
+        total_manifests +=
+            (shape.get(0).unwrap().array_length() as u32).div_ceil(t_split_size) as usize;
+        session.commit(format!("finished time again").as_ref(), None).await?;
+        assert_manifest_count(&backend, total_manifests).await;
+        for idx in [0, 12, 24] {
+            let actual = get_chunk(
+                session
+                    .get_chunk_reader(
+                        &temp_path,
+                        &ChunkIndices(vec![idx.clone(), 0, 0, 0]),
+                        &ByteRange::ALL,
+                    )
+                    .await
+                    .unwrap(),
+            )
+            .await
+            .unwrap()
+            .unwrap();
+            let expected = Bytes::copy_from_slice(format!("{0}", idx + 2).as_bytes());
+            assert_eq!(actual,expected);
+        }
         Ok(())
     }
 

From 14d70df4f358f5f681548935fd7e4369ce0b47bb Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 13 Jun 2025 16:39:20 -0600
Subject: [PATCH 29/43] Fix merging

---
 icechunk/src/change_set.rs      |   8 +-
 icechunk/src/format/manifest.rs |  12 +++
 icechunk/src/repository.rs      | 155 +++++++++++++++++++++++++++++---
 icechunk/src/session.rs         |  11 +++
 4 files changed, 168 insertions(+), 18 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 861df90fa..085f4f542 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -387,16 +387,16 @@ impl ChangeSet {
         // FIXME: do we even test this?
         self.deleted_chunks_outside_bounds.extend(other.deleted_chunks_outside_bounds);
 
-        for (node, other_splits) in other.set_chunks {
+        other.set_chunks.into_iter().for_each(|(node, other_splits)| {
             let manifests = self.set_chunks.entry(node).or_insert_with(|| {
                 HashMap::<ManifestExtents, SplitManifest>::with_capacity(
                     other_splits.len(),
                 )
             });
-            for (extent, their_manifest) in other_splits {
+            other_splits.into_iter().for_each(|(extent, their_manifest)| {
                 manifests.entry(extent).or_default().extend(their_manifest)
-            }
-        }
+            })
+        });
     }
 
     pub fn merge_many<T: IntoIterator<Item = ChangeSet>>(&mut self, others: T) {
diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index 9e0ae5b55..446a5b6cc 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -164,6 +164,18 @@ impl ManifestSplits {
     pub fn len(&self) -> usize {
         self.0.len()
     }
+
+    /// Returns `true` unless an extent of `self` and an extent of `other`
+    /// overlap only partially; both directions of `overlap_with` are checked.
+    pub fn compatible_with(&self, other: &Self) -> bool {
+        let has_partial_overlap = self.iter().any(|ours| {
+            any(other.iter(), |theirs| {
+                ours.overlap_with(theirs) == Overlap::Partial
+                    || theirs.overlap_with(ours) == Overlap::Partial
+            })
+        });
+        !has_partial_overlap
+    }
 }
 
 /// Helper function for constructing uniformly spaced manifest split edges
diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 4b43455fd..c91470469 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -936,6 +936,7 @@ mod tests {
     use std::{collections::HashMap, error::Error, path::PathBuf, sync::Arc};
 
     use icechunk_macros::tokio_test;
+    use itertools::enumerate;
     use storage::logging::LoggingStorage;
     use tempfile::TempDir;
 
@@ -1164,6 +1165,25 @@ mod tests {
         ));
     }
 
+    /// Test helper: reopens `repo` with `split_sizes` as its manifest splitting config.
+    fn reopen_repo_with_new_splitting_config(
+        repo: &Repository,
+        split_sizes: Option<Vec<(ManifestSplitCondition, Vec<ManifestSplitDim>)>>,
+    ) -> Repository {
+        let man_config = ManifestConfig {
+            preload: Some(ManifestPreloadConfig {
+                max_total_refs: None,
+                preload_if: None,
+            }),
+            splitting: Some(ManifestSplittingConfig { split_sizes }),
+        };
+        let config = RepositoryConfig {
+            manifest: Some(man_config),
+            ..RepositoryConfig::default()
+        };
+        repo.reopen(Some(config), None).unwrap()
+    }
+
     async fn create_repo_with_split_manifest_config(
         path: &Path,
         shape: &ArrayShape,
@@ -1640,19 +1660,8 @@ mod tests {
             }],
         )];
 
-        let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) };
-        let man_config = ManifestConfig {
-            preload: Some(ManifestPreloadConfig {
-                max_total_refs: None,
-                preload_if: None,
-            }),
-            splitting: Some(split_config.clone()),
-        };
-        let config = RepositoryConfig {
-            manifest: Some(man_config),
-            ..RepositoryConfig::default()
-        };
-        let repository = repository.reopen(Some(config), None)?;
+        let repository =
+            reopen_repo_with_new_splitting_config(&repository, Some(split_sizes));
         verify_all_data(&repository).await;
         let mut session = repository.writable_session("main").await?;
         let index = vec![13, 0, 0, 0];
@@ -1758,7 +1767,125 @@ mod tests {
             .unwrap()
             .unwrap();
             let expected = Bytes::copy_from_slice(format!("{0}", idx + 2).as_bytes());
-            assert_eq!(actual,expected);
+            assert_eq!(actual, expected);
+        }
+        Ok(())
+    }
+
+    #[tokio_test]
+    async fn test_manifest_splits_merge_sessions() -> Result<(), Box<dyn Error>> {
+        let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap();
+        let dimension_names = Some(vec!["t".into(), "z".into(), "y".into(), "x".into()]);
+        let temp_path: Path = "/temperature".try_into().unwrap();
+
+        let orig_split_sizes = vec![(
+            ManifestSplitCondition::AnyArray,
+            vec![ManifestSplitDim {
+                condition: ManifestSplitDimCondition::DimensionName("t".to_string()),
+                num_chunks: 12u32,
+            }],
+        )];
+        let split_config =
+            ManifestSplittingConfig { split_sizes: Some(orig_split_sizes.clone()) };
+        let backend: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;
+        let repository = create_repo_with_split_manifest_config(
+            &temp_path,
+            &shape,
+            &dimension_names,
+            &split_config,
+            Some(backend),
+        )
+        .await?;
+
+        let indices =
+            vec![vec![0, 0, 1, 0], vec![0, 0, 0, 0], vec![0, 2, 0, 0], vec![0, 2, 0, 1]];
+
+        let mut session1 = repository.writable_session("main").await?;
+        session1
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[0].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 0).into())),
+            )
+            .await?;
+        session1
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[1].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 1).into())),
+            )
+            .await?;
+
+        for incompatible_size in [1, 11u32, 24u32, u32::MAX] {
+            let incompatible_split_sizes = vec![(
+                ManifestSplitCondition::AnyArray,
+                vec![ManifestSplitDim {
+                    condition: ManifestSplitDimCondition::DimensionName("t".to_string()),
+                    num_chunks: incompatible_size,
+                }],
+            )];
+            let other_repo = reopen_repo_with_new_splitting_config(
+                &repository,
+                Some(incompatible_split_sizes),
+            );
+
+            assert_ne!(other_repo.config(), repository.config());
+
+            let mut session2 = other_repo.writable_session("main").await?;
+            session2
+                .set_chunk_ref(
+                    temp_path.clone(),
+                    ChunkIndices(indices[2].clone()),
+                    Some(ChunkPayload::Inline(format!("{0}", 2).into())),
+                )
+                .await?;
+            session2
+                .set_chunk_ref(
+                    temp_path.clone(),
+                    ChunkIndices(indices[3].clone()),
+                    Some(ChunkPayload::Inline(format!("{0}", 3).into())),
+                )
+                .await?;
+
+            assert!(session1.merge(session2).await.is_err());
+        }
+
+        // now with the same split sizes
+        let other_repo =
+            reopen_repo_with_new_splitting_config(&repository, Some(orig_split_sizes));
+        let mut session2 = other_repo.writable_session("main").await?;
+        session2
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[2].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 2).into())),
+            )
+            .await?;
+        session2
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[3].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 3).into())),
+            )
+            .await?;
+
+        session1.merge(session2).await?;
+        for (val, idx) in enumerate(indices.iter()) {
+            let actual = get_chunk(
+                session1
+                    .get_chunk_reader(
+                        &temp_path,
+                        &ChunkIndices(idx.clone()),
+                        &ByteRange::ALL,
+                    )
+                    .await
+                    .unwrap(),
+            )
+            .await
+            .unwrap()
+            .expect(&format!("getting chunk ref failed for {:?}", &idx));
+            let expected = Bytes::copy_from_slice(format!("{0}", val).as_bytes());
+            assert_eq!(actual, expected);
         }
         Ok(())
     }
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 5ae5b96b7..e56210fb8 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -109,6 +109,8 @@ pub enum SessionErrorKind {
     InvalidIndex { coords: ChunkIndices, path: Path },
     #[error("invalid chunk index for splitting manifests: {coords:?}")]
     InvalidIndexForSplitManifests { coords: ChunkIndices },
+    #[error("incompatible manifest splitting config when merging sessions")]
+    IncompatibleSplits,
     #[error("`to` snapshot ancestry doesn't include `from`")]
     BadSnapshotChainForDiff,
 }
@@ -904,6 +906,15 @@ impl Session {
             return Err(SessionErrorKind::ReadOnlySession.into());
         }
         let Session { splits, change_set, .. } = other;
+
+        if self.splits.iter().any(|(node, our_splits)| {
+            splits
+                .get(node)
+                .is_some_and(|their_splits| !our_splits.compatible_with(their_splits))
+        }) {
+            return Err(SessionErrorKind::IncompatibleSplits.into());
+        }
+
         self.splits.extend(splits);
         self.change_set.merge(change_set);
         Ok(())

From 27c7c4b9c21ff1210f2df487cadb3e338ad2d6ea Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 13 Jun 2025 17:01:31 -0600
Subject: [PATCH 30/43] reorg tests

---
 icechunk/src/format/manifest.rs | 253 ++++++++++++++++++++++++++++----
 icechunk/src/session.rs         | 186 +----------------------
 2 files changed, 223 insertions(+), 216 deletions(-)

diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index 446a5b6cc..eea131b77 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -594,43 +594,232 @@ static ROOT_OPTIONS: VerifierOptions = VerifierOptions {
 #[allow(clippy::unwrap_used, clippy::panic)]
 mod tests {
     use super::*;
-    use crate::strategies::{ShapeDim, shapes_and_dims};
+    use crate::strategies::{ShapeDim, manifest_extents, shapes_and_dims};
     use icechunk_macros;
+    use itertools::{all, multizip};
+    use proptest::collection::vec;
     use proptest::prelude::*;
+    use std::error::Error;
+    use test_strategy::proptest;
+
+    #[proptest]
+    fn test_property_extents_set_ops_same(
+        #[strategy(manifest_extents(4))] e: ManifestExtents,
+    ) {
+        prop_assert_eq!(e.intersection(&e), Some(e.clone())); // self-intersection is `e`
+        prop_assert_eq!(e.union(&e), e.clone()); // self-union is `e`
+        prop_assert_eq!(e.overlap_with(&e), Overlap::Complete); // `e` fully overlaps itself
+    }
+
+    /// Checks union / intersection / overlap relations between two arbitrary extents.
+    #[proptest]
+    fn test_property_extents_set_ops(
+        #[strategy(manifest_extents(4))] e1: ManifestExtents,
+        #[strategy(manifest_extents(4))] e2: ManifestExtents,
+    ) {
+        let union = e1.union(&e2);
+        let intersection = e1.intersection(&e2);
+        prop_assert_eq!(e1.intersection(&union), Some(e1.clone()));
+        prop_assert_eq!(union.intersection(&e1), Some(e1.clone()));
+        prop_assert_eq!(e2.intersection(&union), Some(e2.clone()));
+        prop_assert_eq!(union.intersection(&e2), Some(e2.clone()));
+
+        // order is important for the next 2
+        prop_assert_eq!(e1.overlap_with(&union), Overlap::Complete);
+        prop_assert_eq!(e2.overlap_with(&union), Overlap::Complete);
+
+        if let Some(int) = intersection {
+            // the intersection must be completely overlapped by each operand,
+            // whether or not the two operands are equal
+            prop_assert_eq!(int.overlap_with(&e1), Overlap::Complete);
+            prop_assert_eq!(int.overlap_with(&e2), Overlap::Complete);
+        } else {
+            prop_assert_eq!(e2.overlap_with(&e1), Overlap::None);
+            prop_assert_eq!(e1.overlap_with(&e2), Overlap::None);
+        }
+    }
 
-    proptest! {
-        #[icechunk_macros::test]
-        fn test_manifest_split_from_edges(shape_dim in shapes_and_dims(Some(5))) {
-            // Note: using the shape, chunks strategy to generate chunk_shape, split_shape
-            let ShapeDim { shape, .. } = shape_dim;
-
-            let num_chunks = shape.iter().map(|x| x.array_length()).collect::<Vec<_>>();
-            let split_shape = shape.iter().map(|x| x.chunk_length()).collect::<Vec<_>>();
-
-            let ndim = shape.len();
-            let edges: Vec<Vec<u32>> =
-                (0usize..ndim).map(|axis| {
-                    uniform_manifest_split_edges(num_chunks[axis] as u32, &(split_shape[axis] as u32))
-                }
-                ).collect();
-
-            let splits = ManifestSplits::from_edges(edges.into_iter());
-            for edge in splits.iter() {
-                // must be ndim ranges
-                prop_assert_eq!(edge.len(), ndim);
-                for range in edge.iter() {
-                    prop_assert!(range.end > range.start);
-                }
-            }
+    #[proptest]
+    fn test_property_extents_widths(
+        #[strategy(manifest_extents(4))] extent1: ManifestExtents,
+        #[strategy(vec(0..100, 4))] delta_left: Vec<i32>,
+        #[strategy(vec(0..100, 4))] delta_right: Vec<i32>,
+    ) {
+        let widths = extent1.iter().map(|r| (r.end - r.start) as i32).collect::<Vec<_>>();
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map(
+                |(extent, dleft, dright)| {
+                    ((extent.start as i32 + dleft) as u32)
+                        ..((extent.end as i32 + dright) as u32)
+                },
+            ),
+        );
+        // with all deltas zero, extent2 is identical to extent1: complete overlap
+        if all(delta_left.iter(), |elem| elem == &0i32)
+            && all(delta_right.iter(), |elem| elem == &0i32)
+        {
+            prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Complete);
+        }
+        // shift right by at least one full width per dimension: starts at or past `end`
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((
+                extent1.iter(),
+                widths.iter(),
+                delta_left.iter(),
+                delta_right.iter(),
+            ))
+            .map(|(extent, width, dleft, dright)| {
+                let (low, high) = (dleft.min(dright), dleft.max(dright));
+                ((extent.start as i32 + width + low) as u32)
+                    ..((extent.end as i32 + width + high) as u32)
+            }),
+        );
+
+        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None);
+        // mirror case: shift left by at least one full width; the start is clamped at 0
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((
+                extent1.iter(),
+                widths.iter(),
+                delta_left.iter(),
+                delta_right.iter(),
+            ))
+            .map(|(extent, width, dleft, dright)| {
+                let (low, high) = (dleft.min(dright), dleft.max(dright));
+                ((extent.start as i32 - width - high).max(0i32) as u32)
+                    ..((extent.end as i32 - width - low) as u32) // NOTE(review): not clamped; wraps if start < low - confirm strategy rules this out
+            }),
+        );
+        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None);
+
+        let extent2 = ManifestExtents::from_ranges_iter(
+            multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map(
+                |(extent, dleft, dright)| {
+                    ((extent.start as i32 - dleft - 1).max(0i32) as u32)
+                        ..((extent.end as i32 + dright + 1) as u32) // always past extent1's end
+                },
+            ),
+        );
+        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Partial);
+    }
+
+    #[icechunk_macros::test]
+    fn test_overlaps() -> Result<(), Box<dyn Error>> {
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![2u32, 4, 6].as_slice(),
+        );
+
+        let e2 = ManifestExtents::new(
+            vec![10u32, 1, 2].as_slice(),
+            vec![12u32, 4, 6].as_slice(),
+        );
+
+        let union = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![12u32, 4, 6].as_slice(),
+        );
+
+        assert_eq!(e2.overlap_with(&e1), Overlap::None);
+        assert_eq!(e1.intersection(&e2), None);
+        assert_eq!(e1.union(&e2), union);
 
-            // when using from_edges, extents must not exactly overlap
-            for edges in splits.iter().combinations(2) {
-                let is_equal = std::iter::zip(edges[0].iter(), edges[1].iter())
-                    .all(|(range1, range2)| {
-                        (range1.start == range2.start) && (range1.end == range2.end)
-                    });
-                prop_assert!(!is_equal);
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![2u32, 4, 6].as_slice(),
+        );
+        let e2 = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![42u32, 4, 6].as_slice(),
+        );
+        assert_eq!(e2.overlap_with(&e1), Overlap::None);
+        assert_eq!(e1.overlap_with(&e2), Overlap::None);
+
+        // asymmetric case
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let e2 = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let union = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let intersection = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        assert_eq!(e2.overlap_with(&e1), Overlap::Complete);
+        assert_eq!(e1.overlap_with(&e2), Overlap::Partial);
+        assert_eq!(e1.union(&e2), union.clone());
+        assert_eq!(e2.union(&e1), union.clone());
+        assert_eq!(e1.intersection(&e2), Some(intersection));
+
+        // empty set
+        let e1 = ManifestExtents::new(
+            vec![0u32, 1, 2].as_slice(),
+            vec![3u32, 4, 6].as_slice(),
+        );
+        let e2 = ManifestExtents::new(
+            vec![2u32, 1, 2].as_slice(),
+            vec![2u32, 4, 6].as_slice(),
+        );
+        assert_eq!(e1.intersection(&e2), None);
+
+        // this should create non-overlapping extents
+        let splits = ManifestSplits::from_edges(vec![
+            vec![0, 10, 20],
+            vec![0, 1, 2],
+            vec![0, 21, 22],
+        ]);
+        for vec in splits.iter().combinations(2) {
+            assert_eq!(vec[0].overlap_with(vec[1]), Overlap::None);
+            assert_eq!(vec[1].overlap_with(vec[0]), Overlap::None);
+        }
+
+        Ok(())
+    }
+
+    #[proptest]
+    fn test_manifest_split_from_edges(
+        #[strategy(shapes_and_dims(Some(5)))] shape_dim: ShapeDim,
+    ) {
+        // Note: using the shape, chunks strategy to generate chunk_shape, split_shape
+        let ShapeDim { shape, .. } = shape_dim;
+
+        let num_chunks = shape.iter().map(|x| x.array_length()).collect::<Vec<_>>();
+        let split_shape = shape.iter().map(|x| x.chunk_length()).collect::<Vec<_>>();
+
+        let ndim = shape.len();
+        let edges: Vec<Vec<u32>> = (0usize..ndim)
+            .map(|axis| {
+                uniform_manifest_split_edges(
+                    num_chunks[axis] as u32,
+                    &(split_shape[axis] as u32),
+                )
+            })
+            .collect();
+
+        let splits = ManifestSplits::from_edges(edges.into_iter());
+        for edge in splits.iter() {
+            // must be ndim ranges
+            prop_assert_eq!(edge.len(), ndim);
+            for range in edge.iter() {
+                prop_assert!(range.end > range.start);
             }
         }
+
+        // when using from_edges, extents must not exactly overlap
+        for edges in splits.iter().combinations(2) {
+            let is_equal = std::iter::zip(edges[0].iter(), edges[1].iter()).all(
+                |(range1, range2)| {
+                    (range1.start == range2.start) && (range1.end == range2.end)
+                },
+            );
+            prop_assert!(!is_equal);
+        }
     }
 }
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index e56210fb8..0f11af6c3 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -2167,16 +2167,14 @@ mod tests {
         repository::VersionInfo,
         storage::new_in_memory_storage,
         strategies::{
-            ShapeDim, chunk_indices, empty_writable_session, manifest_extents,
-            node_paths, shapes_and_dims,
+            ShapeDim, chunk_indices, empty_writable_session, node_paths, shapes_and_dims,
         },
     };
 
     use super::*;
     use icechunk_macros::tokio_test;
-    use itertools::{Itertools, all, multizip};
+    use itertools::Itertools;
     use pretty_assertions::assert_eq;
-    use proptest::collection::vec;
     use proptest::prelude::{prop_assert, prop_assert_eq};
     use storage::logging::LoggingStorage;
     use test_strategy::proptest;
@@ -2336,186 +2334,6 @@ mod tests {
         prop_assert!(session.delete_group(path.clone()).await.is_ok());
     }
 
-    #[proptest]
-    fn test_property_extents_set_ops_same(
-        #[strategy(manifest_extents(4))] e: ManifestExtents,
-    ) {
-        prop_assert_eq!(e.intersection(&e), Some(e.clone()));
-        prop_assert_eq!(e.union(&e), e.clone());
-        prop_assert_eq!(e.overlap_with(&e), Overlap::Complete);
-    }
-
-    #[proptest]
-    fn test_property_extents_set_ops(
-        #[strategy(manifest_extents(4))] e1: ManifestExtents,
-        #[strategy(manifest_extents(4))] e2: ManifestExtents,
-    ) {
-        let union = e1.union(&e2);
-        let intersection = e1.intersection(&e2);
-
-        prop_assert_eq!(e1.intersection(&union), Some(e1.clone()));
-        prop_assert_eq!(union.intersection(&e1), Some(e1.clone()));
-        prop_assert_eq!(e2.intersection(&union), Some(e2.clone()));
-        prop_assert_eq!(union.intersection(&e2), Some(e2.clone()));
-
-        // order is important for the next 2
-        prop_assert_eq!(e1.overlap_with(&union), Overlap::Complete);
-        prop_assert_eq!(e2.overlap_with(&union), Overlap::Complete);
-
-        if intersection.is_some() {
-            let int = intersection.unwrap();
-            let expected = if e1 == e1 { Overlap::Complete } else { Overlap::Partial };
-            prop_assert_eq!(int.overlap_with(&e1), expected.clone());
-            prop_assert_eq!(int.overlap_with(&e2), expected);
-        } else {
-            prop_assert_eq!(e2.overlap_with(&e1), Overlap::None);
-            prop_assert_eq!(e1.overlap_with(&e2), Overlap::None);
-        }
-    }
-
-    #[proptest]
-    fn test_property_extents_widths(
-        #[strategy(manifest_extents(4))] extent1: ManifestExtents,
-        #[strategy(vec(0..100, 4))] delta_left: Vec<i32>,
-        #[strategy(vec(0..100, 4))] delta_right: Vec<i32>,
-    ) {
-        let widths = extent1.iter().map(|r| (r.end - r.start) as i32).collect::<Vec<_>>();
-        let extent2 = ManifestExtents::from_ranges_iter(
-            multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map(
-                |(extent, dleft, dright)| {
-                    ((extent.start as i32 + dleft) as u32)
-                        ..((extent.end as i32 + dright) as u32)
-                },
-            ),
-        );
-
-        if all(delta_left.iter(), |elem| elem == &0i32)
-            && all(delta_right.iter(), |elem| elem == &0i32)
-        {
-            prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Complete);
-        }
-
-        let extent2 = ManifestExtents::from_ranges_iter(
-            multizip((
-                extent1.iter(),
-                widths.iter(),
-                delta_left.iter(),
-                delta_right.iter(),
-            ))
-            .map(|(extent, width, dleft, dright)| {
-                let (low, high) = (dleft.min(dright), dleft.max(dright));
-                ((extent.start as i32 + width + low) as u32)
-                    ..((extent.end as i32 + width + high) as u32)
-            }),
-        );
-
-        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None);
-
-        let extent2 = ManifestExtents::from_ranges_iter(
-            multizip((
-                extent1.iter(),
-                widths.iter(),
-                delta_left.iter(),
-                delta_right.iter(),
-            ))
-            .map(|(extent, width, dleft, dright)| {
-                let (low, high) = (dleft.min(dright), dleft.max(dright));
-                ((extent.start as i32 - width - high).max(0i32) as u32)
-                    ..((extent.end as i32 - width - low) as u32)
-            }),
-        );
-        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None);
-
-        let extent2 = ManifestExtents::from_ranges_iter(
-            multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map(
-                |(extent, dleft, dright)| {
-                    ((extent.start as i32 - dleft - 1).max(0i32) as u32)
-                        ..((extent.end as i32 + dright + 1) as u32)
-                },
-            ),
-        );
-        prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Partial);
-    }
-
-    #[icechunk_macros::test]
-    fn test_overlaps() -> Result<(), Box<dyn Error>> {
-        let e1 = ManifestExtents::new(
-            vec![0u32, 1, 2].as_slice(),
-            vec![2u32, 4, 6].as_slice(),
-        );
-
-        let e2 = ManifestExtents::new(
-            vec![10u32, 1, 2].as_slice(),
-            vec![12u32, 4, 6].as_slice(),
-        );
-
-        let union = ManifestExtents::new(
-            vec![0u32, 1, 2].as_slice(),
-            vec![12u32, 4, 6].as_slice(),
-        );
-
-        assert_eq!(e2.overlap_with(&e1), Overlap::None);
-        assert_eq!(e1.intersection(&e2), None);
-        assert_eq!(e1.union(&e2), union);
-
-        let e1 = ManifestExtents::new(
-            vec![0u32, 1, 2].as_slice(),
-            vec![2u32, 4, 6].as_slice(),
-        );
-        let e2 = ManifestExtents::new(
-            vec![2u32, 1, 2].as_slice(),
-            vec![42u32, 4, 6].as_slice(),
-        );
-        assert_eq!(e2.overlap_with(&e1), Overlap::None);
-        assert_eq!(e1.overlap_with(&e2), Overlap::None);
-
-        // asymmetric case
-        let e1 = ManifestExtents::new(
-            vec![0u32, 1, 2].as_slice(),
-            vec![3u32, 4, 6].as_slice(),
-        );
-        let e2 = ManifestExtents::new(
-            vec![2u32, 1, 2].as_slice(),
-            vec![3u32, 4, 6].as_slice(),
-        );
-        let union = ManifestExtents::new(
-            vec![0u32, 1, 2].as_slice(),
-            vec![3u32, 4, 6].as_slice(),
-        );
-        let intersection = ManifestExtents::new(
-            vec![2u32, 1, 2].as_slice(),
-            vec![3u32, 4, 6].as_slice(),
-        );
-        assert_eq!(e2.overlap_with(&e1), Overlap::Complete);
-        assert_eq!(e1.overlap_with(&e2), Overlap::Partial);
-        assert_eq!(e1.union(&e2), union.clone());
-        assert_eq!(e2.union(&e1), union.clone());
-        assert_eq!(e1.intersection(&e2), Some(intersection));
-
-        // empty set
-        let e1 = ManifestExtents::new(
-            vec![0u32, 1, 2].as_slice(),
-            vec![3u32, 4, 6].as_slice(),
-        );
-        let e2 = ManifestExtents::new(
-            vec![2u32, 1, 2].as_slice(),
-            vec![2u32, 4, 6].as_slice(),
-        );
-        assert_eq!(e1.intersection(&e2), None);
-
-        // this should create non-overlapping extents
-        let splits = ManifestSplits::from_edges(vec![
-            vec![0, 10, 20],
-            vec![0, 1, 2],
-            vec![0, 21, 22],
-        ]);
-        for vec in splits.iter().combinations(2) {
-            assert_eq!(vec[0].overlap_with(vec[1]), Overlap::None);
-            assert_eq!(vec[1].overlap_with(vec[0]), Overlap::None);
-        }
-
-        Ok(())
-    }
     #[proptest(async = "tokio")]
     async fn test_aggregate_extents(
         #[strategy(proptest::collection::vec(chunk_indices(3, 0..1_000_000), 1..50))]

From 9e1c8adc2da1420f22739bff9a36613f1f2ff903 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 13 Jun 2025 17:22:32 -0600
Subject: [PATCH 31/43] edits

---
 icechunk/src/change_set.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 085f4f542..04a0ddb6e 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -130,6 +130,7 @@ impl ChangeSet {
         }
 
         // update existing splits
+        let mut to_remove = HashSet::<ChunkIndices>::new();
         if let Some(manifests) = self.set_chunks.remove(node_id) {
             let mut new_deleted_chunks = HashSet::<ChunkIndices>::new();
             let mut new_manifests =
@@ -165,15 +166,18 @@ impl ChangeSet {
             }
 
             // bring back any previously tracked deletes
-            if let Some(deletes) = self.deleted_chunks_outside_bounds.get(node_id) {
+            if let Some(deletes) = self.deleted_chunks_outside_bounds.get_mut(node_id) {
                 for coord in deletes.iter() {
                     if let Some(extents) = new_splits.find(coord) {
                         new_manifests
                             .entry(extents)
                             .or_default()
                             .insert(coord.clone(), None);
+                        to_remove.insert(coord.clone());
                     };
                 }
+                deletes.retain(|item| !to_remove.contains(item));
+                to_remove.drain();
             };
             self.set_chunks.insert(node_id.clone(), new_manifests);
 
@@ -242,16 +246,16 @@ impl ChangeSet {
         // this implementation makes delete idempotent
         // it allows deleting a deleted chunk by repeatedly setting None.
         self.set_chunks
-            .entry(node_id.clone())
+            .entry(node_id)
             .or_insert_with(|| {
                 HashMap::<
                     ManifestExtents,
                     BTreeMap<ChunkIndices, Option<ChunkPayload>>,
                 >::with_capacity(splits.len())
             })
-            .entry(extent.clone())
+            .entry(extent)
             .or_default()
-            .insert(coord.clone(), data.clone());
+            .insert(coord, data);
     }
 
     pub fn get_chunk_ref(

From e974f483da600fb88a54838f813e9e0d16aae810 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 16 Jun 2025 13:22:30 -0600
Subject: [PATCH 32/43] Add conflicting commits test

---
 icechunk/src/change_set.rs |   1 -
 icechunk/src/repository.rs | 162 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 04a0ddb6e..1e0a13ca8 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -77,7 +77,6 @@ impl ChangeSet {
     }
 
     pub fn arrays_with_chunk_changes(&self) -> impl Iterator<Item = &NodeId> {
-        // FIXME: needs test for session with only chunk deletes on existing nodes
         self.set_chunks.keys()
     }
 
diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index c91470469..0c1d7f7ef 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -933,7 +933,7 @@ pub async fn raise_if_invalid_snapshot_id(
 #[cfg(test)]
 #[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)]
 mod tests {
-    use std::{collections::HashMap, error::Error, path::PathBuf, sync::Arc};
+    use std::{collections::HashMap, error::Error, iter::zip, path::PathBuf, sync::Arc};
 
     use icechunk_macros::tokio_test;
     use itertools::enumerate;
@@ -947,13 +947,14 @@ mod tests {
             ManifestSplitDim, ManifestSplitDimCondition, ManifestSplittingConfig,
             RepositoryConfig,
         },
+        conflicts::basic_solver::BasicConflictSolver,
         format::{
             ByteRange, ChunkIndices,
             manifest::{ChunkPayload, ManifestSplits},
             snapshot::{ArrayShape, DimensionName},
         },
         new_local_filesystem_storage,
-        session::get_chunk,
+        session::{SessionError, get_chunk},
         storage::new_in_memory_storage,
     };
 
@@ -1887,6 +1888,163 @@ mod tests {
             let expected = Bytes::copy_from_slice(format!("{0}", val).as_bytes());
             assert_eq!(actual, expected);
         }
+
+        // now merge two sessions: one with only writes, one with only deletes
+        let mut session1 = repository.writable_session("main").await?;
+        session1
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[0].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 3).into())),
+            )
+            .await?;
+        session1
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[1].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 4).into())),
+            )
+            .await?;
+        let mut session2 = repository.writable_session("main").await?;
+        session2
+            .set_chunk_ref(temp_path.clone(), ChunkIndices(indices[2].clone()), None)
+            .await?;
+        session2
+            .set_chunk_ref(temp_path.clone(), ChunkIndices(indices[3].clone()), None)
+            .await?;
+
+        session1.merge(session2).await?;
+        let expected = vec![Some(3), Some(4), None, None];
+        for (expect, idx) in zip(expected.iter(), indices.iter()) {
+            let actual = get_chunk(
+                session1
+                    .get_chunk_reader(
+                        &temp_path,
+                        &ChunkIndices(idx.clone()),
+                        &ByteRange::ALL,
+                    )
+                    .await
+                    .unwrap(),
+            )
+            .await
+            .unwrap();
+            let expected_value =
+                expect.map(|val| Bytes::copy_from_slice(format!("{0}", val).as_bytes()));
+            assert_eq!(actual, expected_value);
+        }
+
+        Ok(())
+    }
+
+    #[tokio_test]
+    async fn test_commits_with_conflicting_manifest_splits() -> Result<(), Box<dyn Error>>
+    {
+        let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap();
+        let dimension_names = Some(vec!["t".into(), "z".into(), "y".into(), "x".into()]);
+        let temp_path: Path = "/temperature".try_into().unwrap();
+
+        let orig_split_sizes = vec![(
+            ManifestSplitCondition::AnyArray,
+            vec![ManifestSplitDim {
+                condition: ManifestSplitDimCondition::DimensionName("t".to_string()),
+                num_chunks: 12u32,
+            }],
+        )];
+        let split_config =
+            ManifestSplittingConfig { split_sizes: Some(orig_split_sizes.clone()) };
+        let backend: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;
+        let repository = create_repo_with_split_manifest_config(
+            &temp_path,
+            &shape,
+            &dimension_names,
+            &split_config,
+            Some(backend),
+        )
+        .await?;
+
+        let indices =
+            vec![vec![0, 0, 1, 0], vec![0, 0, 0, 0], vec![0, 2, 0, 0], vec![0, 2, 0, 1]];
+
+        let mut session1 = repository.writable_session("main").await?;
+        session1
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[0].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 0).into())),
+            )
+            .await?;
+        session1
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[1].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 1).into())),
+            )
+            .await?;
+
+        let incompatible_size = 11u32;
+        let incompatible_split_sizes = vec![(
+            ManifestSplitCondition::AnyArray,
+            vec![ManifestSplitDim {
+                condition: ManifestSplitDimCondition::DimensionName("t".to_string()),
+                num_chunks: incompatible_size,
+            }],
+        )];
+        let other_repo = reopen_repo_with_new_splitting_config(
+            &repository,
+            Some(incompatible_split_sizes),
+        );
+
+        assert_ne!(other_repo.config(), repository.config());
+
+        let mut session2 = other_repo.writable_session("main").await?;
+        session2
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[2].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 2).into())),
+            )
+            .await?;
+        session2
+            .set_chunk_ref(
+                temp_path.clone(),
+                ChunkIndices(indices[3].clone()),
+                Some(ChunkPayload::Inline(format!("{0}", 3).into())),
+            )
+            .await?;
+
+        session1.commit("first commit", None).await?;
+        if let Err(SessionError { kind: SessionErrorKind::Conflict { .. }, .. }) =
+            session2.commit("second commit", None).await
+        {
+            let solver = BasicConflictSolver::default();
+            // different chunks were written so this should fast forward
+            assert!(session2.rebase(&solver).await.is_ok());
+            session2.commit("second commit after rebase", None).await?;
+        } else {
+            panic!("this should have conflicted!");
+        }
+
+        let new_session = repository
+            .readonly_session(&VersionInfo::BranchTipRef("main".into()))
+            .await?;
+        for (val, idx) in enumerate(indices.iter()) {
+            let actual = get_chunk(
+                new_session
+                    .get_chunk_reader(
+                        &temp_path,
+                        &ChunkIndices(idx.clone()),
+                        &ByteRange::ALL,
+                    )
+                    .await
+                    .unwrap(),
+            )
+            .await
+            .unwrap()
+            .expect(&format!("getting chunk ref failed for {:?}", &idx));
+            let expected = Bytes::copy_from_slice(format!("{0}", val).as_bytes());
+            assert_eq!(actual, expected);
+        }
+
         Ok(())
     }
 

From 76e58fd14d25bacaf07e18c16c7475c36d6f801e Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 16 Jun 2025 14:10:25 -0600
Subject: [PATCH 33/43] Add test for splits changing in a session

---
 icechunk/src/repository.rs | 127 ++++++++++++++++++++++++++++++++++++-
 icechunk/src/session.rs    |   7 +-
 2 files changed, 130 insertions(+), 4 deletions(-)

diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 0c1d7f7ef..62997aa94 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -1262,7 +1262,8 @@ mod tests {
         assert_manifest_count(&storage, 1).await;
 
         // Important we are not issuing any chunk deletes here (which is what Zarr does)
-        // Note we are still rewriting the manifest
+        // Note we are still rewriting the manifest even without chunk changes
+        // GH604
         let mut session = repo.writable_session("main").await?;
         let shape2 = ArrayShape::new(vec![(2, 1)]).unwrap();
         session
@@ -1291,12 +1292,132 @@ mod tests {
         session.commit("second commit", None).await?;
         assert_manifest_count(&storage, 2).await;
 
-        // FIXME: add more complex splitting test
+        Ok(())
+    }
+
+    #[tokio_test]
+    async fn test_splits_change_in_session() -> Result<(), Box<dyn Error>> {
+        let shape = ArrayShape::new(vec![(13, 1), (2, 1), (1, 1)]).unwrap();
+        let dimension_names = Some(vec!["t".into(), "y".into(), "x".into()]);
+        let new_dimension_names = Some(vec!["time".into(), "y".into(), "x".into()]);
+        let array_path: Path = "/temperature".try_into().unwrap();
+        let array_def = Bytes::from_static(br#"{"this":"other array"}"#);
+
+        // two possible split sizes t: 3, time: 4;
+        // then we rename `t` to `time` 😈
+        let split_sizes = vec![
+            (
+                ManifestSplitCondition::PathMatches { regex: r".*".to_string() },
+                vec![ManifestSplitDim {
+                    condition: ManifestSplitDimCondition::DimensionName(
+                        "^t$".to_string(),
+                    ),
+                    num_chunks: 3,
+                }],
+            ),
+            (
+                ManifestSplitCondition::PathMatches { regex: r".*".to_string() },
+                vec![ManifestSplitDim {
+                    condition: ManifestSplitDimCondition::DimensionName(
+                        "time".to_string(),
+                    ),
+                    num_chunks: 4,
+                }],
+            ),
+        ];
+        let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) };
+
+        let backend: Arc<dyn Storage + Send + Sync> = new_in_memory_storage().await?;
+        let logging = Arc::new(LoggingStorage::new(Arc::clone(&backend)));
+        let storage: Arc<dyn Storage + Send + Sync> = logging.clone();
+        let repository = create_repo_with_split_manifest_config(
+            &array_path,
+            &shape,
+            &dimension_names,
+            &split_config,
+            Some(Arc::clone(&storage)),
+        )
+        .await?;
+
+        let verify_data = async |session: &Session, offset: u32| {
+            for idx in 0..12 {
+                let actual = get_chunk(
+                    session
+                        .get_chunk_reader(
+                            &array_path,
+                            &ChunkIndices(vec![idx.clone(), 0, 0]),
+                            &ByteRange::ALL,
+                        )
+                        .await
+                        .unwrap(),
+                )
+                .await
+                .unwrap()
+                .unwrap();
+                let expected = Bytes::copy_from_slice(format!("{0}", idx + offset).as_bytes());
+                assert_eq!(actual, expected);
+            }
+        };
+
+        let mut session = repository.writable_session("main").await?;
+        for i in 0..12 {
+            session
+                .set_chunk_ref(
+                    array_path.clone(),
+                    ChunkIndices(vec![i, 0, 0]),
+                    Some(ChunkPayload::Inline(format!("{0}", i).into())),
+                )
+                .await?
+        }
+        verify_data(&session, 0).await;
+
+        let node = session.get_node(&array_path).await?;
+        let orig_splits = session.lookup_splits(&node.id).cloned();
+        assert_eq!(
+            orig_splits,
+            Some(ManifestSplits::from_edges(vec![
+                vec![0, 3, 6, 9, 12, 13],
+                vec![0, 2],
+                vec![0, 1]
+            ]))
+        );
+
+        // this should update the splits
+        session
+            .update_array(
+                &array_path,
+                shape.clone(),
+                new_dimension_names.clone(),
+                array_def.clone(),
+            )
+            .await?;
+        verify_data(&session, 0).await;
+        let new_splits = session.lookup_splits(&node.id).cloned();
+        assert_eq!(
+            new_splits,
+            Some(ManifestSplits::from_edges(vec![
+                vec![0, 4, 8, 12, 13],
+                vec![0, 2],
+                vec![0, 1]
+            ]))
+        );
+
+        // update data
+        for i in 0..12 {
+            session
+                .set_chunk_ref(
+                    array_path.clone(),
+                    ChunkIndices(vec![i, 0, 0]),
+                    Some(ChunkPayload::Inline(format!("{0}", i+10).into())),
+                )
+                .await?
+        }
+        verify_data(&session, 10).await;
 
         Ok(())
     }
 
-    #[tokio::test]
+    #[tokio_test]
     async fn tests_manifest_splitting_simple() -> Result<(), Box<dyn Error>> {
         let dim_size = 25u32;
         let chunk_size = 1u32;
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 0f11af6c3..57952ec91 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -500,6 +500,10 @@ impl Session {
         self.set_node_chunk_ref(node_snapshot, coord, data).await
     }
 
+    pub fn lookup_splits(&self, node_id: &NodeId) -> Option<&ManifestSplits> {
+        self.splits.get(node_id)
+    }
+
     fn cache_splits(
         &mut self,
         node_id: &NodeId,
@@ -507,9 +511,9 @@ impl Session {
         shape: &ArrayShape,
         dimension_names: &Option<Vec<DimensionName>>,
     ) {
-        // FIXME: handle conflicts here
         // Q: What happens if we set a chunk, then change a dimension name, so
         //    that the split changes.
+        // A: we reorg the existing chunk refs in the changeset to the new splits.
         let splitting = self.config.manifest().splitting();
         let splits = splitting.get_split_sizes(path, shape, dimension_names);
         self.splits.insert(node_id.clone(), splits);
@@ -1842,6 +1846,7 @@ impl ManifestSplittingConfig {
                     } in dim_specs.iter()
                     {
                         if dim_condition.matches(axis, dimname.clone().into()) {
+                            dbg!(&dim_condition, "matched", &dimname);
                             edges[axis] = uniform_manifest_split_edges(
                                 num_chunks[axis],
                                 split_size,

From a306abfdfe531d584f0cb9dffa88e0f377f864a9 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 16 Jun 2025 15:19:28 -0600
Subject: [PATCH 34/43] lint

---
 icechunk/src/repository.rs | 5 +++--
 icechunk/src/session.rs    | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 62997aa94..316acaf34 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -1354,7 +1354,8 @@ mod tests {
                 .await
                 .unwrap()
                 .unwrap();
-                let expected = Bytes::copy_from_slice(format!("{0}", idx + offset).as_bytes());
+                let expected =
+                    Bytes::copy_from_slice(format!("{0}", idx + offset).as_bytes());
                 assert_eq!(actual, expected);
             }
         };
@@ -1408,7 +1409,7 @@ mod tests {
                 .set_chunk_ref(
                     array_path.clone(),
                     ChunkIndices(vec![i, 0, 0]),
-                    Some(ChunkPayload::Inline(format!("{0}", i+10).into())),
+                    Some(ChunkPayload::Inline(format!("{0}", i + 10).into())),
                 )
                 .await?
         }
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 57952ec91..77d5b372a 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1846,7 +1846,6 @@ impl ManifestSplittingConfig {
                     } in dim_specs.iter()
                     {
                         if dim_condition.matches(axis, dimname.clone().into()) {
-                            dbg!(&dim_condition, "matched", &dimname);
                             edges[axis] = uniform_manifest_split_edges(
                                 num_chunks[axis],
                                 split_size,

From e719c7f43ee1dcac36e4bf33a796ba83fdb3b639 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Tue, 17 Jun 2025 13:51:58 -0600
Subject: [PATCH 35/43] Review comments

---
 icechunk/src/change_set.rs      |  9 ++--
 icechunk/src/format/manifest.rs |  3 ++
 icechunk/src/repository.rs      |  6 +++
 icechunk/src/session.rs         | 82 ++++++++++++++++++---------------
 icechunk/tests/test_gc.rs       |  3 ++
 5 files changed, 62 insertions(+), 41 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 1e0a13ca8..25eb8e513 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -169,7 +169,7 @@ impl ChangeSet {
                 for coord in deletes.iter() {
                     if let Some(extents) = new_splits.find(coord) {
                         new_manifests
-                            .entry(extents)
+                            .entry(extents.clone())
                             .or_default()
                             .insert(coord.clone(), None);
                         to_remove.insert(coord.clone());
@@ -252,7 +252,7 @@ impl ChangeSet {
                     BTreeMap<ChunkIndices, Option<ChunkPayload>>,
                 >::with_capacity(splits.len())
             })
-            .entry(extent)
+            .entry(extent.clone())
             .or_default()
             .insert(coord, data);
     }
@@ -263,8 +263,9 @@ impl ChangeSet {
         coords: &ChunkIndices,
     ) -> Option<&Option<ChunkPayload>> {
         self.set_chunks.get(node_id).and_then(|node_chunks| {
-            find_coord(node_chunks.keys(), coords)
-                .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords)))
+            find_coord(node_chunks.keys(), coords).and_then(|(_, extent)| {
+                node_chunks.get(extent).and_then(|s| s.get(coords))
+            })
         })
     }
 
diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index eea131b77..d01efae11 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -166,6 +166,9 @@ impl ManifestSplits {
     }
 
     pub fn compatible_with(&self, other: &Self) -> bool {
+        // this is not a simple zip + all(equals) because
+        // ordering might differ though both sets of splits
+        // must be complete.
         for ours in self.iter() {
             if any(other.iter(), |theirs| {
                 ours.overlap_with(theirs) == Overlap::Partial
diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 316acaf34..9f86530f6 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -1924,6 +1924,7 @@ mod tests {
             vec![vec![0, 0, 1, 0], vec![0, 0, 0, 0], vec![0, 2, 0, 0], vec![0, 2, 0, 1]];
 
         let mut session1 = repository.writable_session("main").await?;
+        let node_id = session1.get_node(&temp_path).await?.id;
         session1
             .set_chunk_ref(
                 temp_path.clone(),
@@ -1992,7 +1993,12 @@ mod tests {
             )
             .await?;
 
+        // Session.splits should be _complete_ so it should be identical for the same node
+        // on any two sessions with compatible splits
+        let splits = session1.lookup_splits(&node_id).unwrap().clone();
+        assert_eq!(session1.lookup_splits(&node_id), session2.lookup_splits(&node_id));
         session1.merge(session2).await?;
+        assert_eq!(session1.lookup_splits(&node_id), Some(&splits));
         for (val, idx) in enumerate(indices.iter()) {
             let actual = get_chunk(
                 session1
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 77d5b372a..0576b96da 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -109,10 +109,15 @@ pub enum SessionErrorKind {
     InvalidIndex { coords: ChunkIndices, path: Path },
     #[error("invalid chunk index for splitting manifests: {coords:?}")]
     InvalidIndexForSplitManifests { coords: ChunkIndices },
-    #[error("incompatible manifest splitting config when merging sessions")]
-    IncompatibleSplits,
+    #[error("incompatible manifest splitting config when merging two sessions")]
+    IncompatibleSplittingConfig {
+        ours: ManifestSplittingConfig,
+        theirs: ManifestSplittingConfig,
+    },
     #[error("`to` snapshot ancestry doesn't include `from`")]
     BadSnapshotChainForDiff,
+    #[error("failed to create manifest from chunk stream")]
+    ManifestCreationError(#[from] Box<SessionError>),
 }
 
 pub type SessionError = ICError<SessionErrorKind>;
@@ -168,7 +173,10 @@ pub type SessionResult<T> = Result<T, SessionError>;
 // and at read time to choose which manifest to query for chunk payload
 /// It is useful to have this act on an iterator (e.g. get_chunk_ref)
 /// The find method on ManifestSplits is simply a helper.
-pub fn find_coord<'a, I>(mut iter: I, coord: &'a ChunkIndices) -> Option<ManifestExtents>
+pub fn find_coord<'a, I>(
+    iter: I,
+    coord: &'a ChunkIndices,
+) -> Option<(usize, &'a ManifestExtents)>
 where
     I: Iterator<Item = &'a ManifestExtents>,
 {
@@ -182,25 +190,18 @@ where
     // ndim must be the same
     // Note: I don't think we can distinguish between out of bounds index for the array
     //       and an index that is part of a split that hasn't been written yet.
-    iter.find(|e| e.contains(coord.0.as_slice())).cloned()
-}
-
-pub fn position_coord<'a, I>(iter: I, coord: &'a ChunkIndices) -> Option<usize>
-where
-    I: Iterator<Item = &'a ManifestExtents>,
-{
-    enumerate(iter).find(|(_, e)| e.contains(coord.0.as_slice())).map(|x| x.0)
+    enumerate(iter).find(|(_, e)| e.contains(coord.0.as_slice()))
 }
 
 impl ManifestSplits {
-    pub fn find(&self, coord: &ChunkIndices) -> Option<ManifestExtents> {
+    pub fn find<'a>(&'a self, coord: &'a ChunkIndices) -> Option<&'a ManifestExtents> {
         debug_assert_eq!(coord.0.len(), self.0[0].len());
-        find_coord(self.iter(), coord)
+        find_coord(self.iter(), coord).map(|x| x.1)
     }
 
     pub fn position(&self, coord: &ChunkIndices) -> Option<usize> {
         debug_assert_eq!(coord.0.len(), self.0[0].len());
-        position_coord(self.iter(), coord)
+        find_coord(self.iter(), coord).map(|x| x.0)
     }
 }
 
@@ -238,6 +239,8 @@ impl Session {
             snapshot_id,
             change_set: ChangeSet::default(),
             default_commit_metadata: SnapshotProperties::default(),
+            // Splits are populated for a node during
+            // `add_array`, `update_array`, and `set_chunk_ref`
             splits: Default::default(),
         }
     }
@@ -504,6 +507,8 @@ impl Session {
         self.splits.get(node_id)
     }
 
+    /// This method is directly called in add_array & update_array
+    /// where we know we must update the splits HashMap
     fn cache_splits(
         &mut self,
         node_id: &NodeId,
@@ -526,11 +531,13 @@ impl Session {
         shape: &ArrayShape,
         dimension_names: &Option<Vec<DimensionName>>,
     ) -> &ManifestSplits {
-        if !self.splits.contains_key(node_id) {
-            self.cache_splits(node_id, path, shape, dimension_names);
-        }
-        #[allow(clippy::expect_used)]
-        self.splits.get(node_id).expect("splits for node should always exist.")
+        self.splits.entry(node_id.clone()).or_insert_with(|| {
+            self.config.manifest().splitting().get_split_sizes(
+                path,
+                shape,
+                dimension_names,
+            )
+        })
     }
 
     // Helper function that accepts a NodeSnapshot instead of a path,
@@ -810,8 +817,8 @@ impl Session {
             return Ok(None);
         }
 
-        let index = match position_coord(manifests.iter().map(|m| &m.extents), coords) {
-            Some(index) => index,
+        let index = match find_coord(manifests.iter().map(|m| &m.extents), coords) {
+            Some((index, _)) => index,
             // for an invalid coordinate, we bail.
             // This happens for two cases:
             // (1) the "coords" is out-of-range for the array shape
@@ -909,17 +916,24 @@ impl Session {
         if self.read_only() {
             return Err(SessionErrorKind::ReadOnlySession.into());
         }
-        let Session { splits, change_set, .. } = other;
+        let Session { splits: other_splits, change_set, config: their_cfg, .. } = other;
 
         if self.splits.iter().any(|(node, our_splits)| {
-            splits
+            other_splits
                 .get(node)
                 .is_some_and(|their_splits| !our_splits.compatible_with(their_splits))
         }) {
-            return Err(SessionErrorKind::IncompatibleSplits.into());
+            let ours = self.config().manifest().splitting().clone();
+            let theirs = their_cfg.manifest().splitting().clone();
+            return Err(
+                SessionErrorKind::IncompatibleSplittingConfig { ours, theirs }.into()
+            );
         }
 
-        self.splits.extend(splits);
+        // Session.splits is _complete_ in that it will include every possible split.
+        // So a simple `extend` is fine, if the same node appears in two sessions,
+        // it must have the same splits and overwriting is fine.
+        self.splits.extend(other_splits);
         self.change_set.merge(change_set);
         Ok(())
     }
@@ -1331,7 +1345,6 @@ async fn verified_node_chunk_iterator<'a>(
                                                 payload,
                                             });
 
-                                        // FIXME: I don't understand this
                                         let old_chunks = change_set
                                             .update_existing_chunks(
                                                 node_id_c3, old_chunks,
@@ -1614,10 +1627,9 @@ impl<'a> FlushProcess<'a> {
         let mut to = vec![];
         let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord);
 
-        #[allow(clippy::expect_used)]
         if let Some(new_manifest) = Manifest::from_stream(chunks)
             .await
-            .expect("failed to create manifest from chunk stream")
+            .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))?
         {
             let new_manifest = Arc::new(new_manifest);
             let new_manifest_size =
@@ -1648,7 +1660,6 @@ impl<'a> FlushProcess<'a> {
         let splits =
             self.splits.get(node_id).expect("getting split for node unexpectedly failed");
 
-        // TODO: this could be try_fold with the refs HashMap as state
         for extent in splits.iter() {
             if self.change_set.array_manifest(node_id, extent).is_some() {
                 let chunks = stream::iter(
@@ -1676,7 +1687,7 @@ impl<'a> FlushProcess<'a> {
     async fn write_manifest_for_existing_node(
         &mut self,
         node: &NodeSnapshot,
-        manifests: Vec<ManifestRef>,
+        existing_manifests: Vec<ManifestRef>,
         old_snapshot: &Snapshot,
     ) -> SessionResult<()> {
         #[allow(clippy::expect_used)]
@@ -1686,18 +1697,17 @@ impl<'a> FlushProcess<'a> {
             HashMap::<ManifestExtents, Vec<ManifestRef>>::with_capacity(splits.len());
 
         let on_disk_extents =
-            manifests.iter().map(|m| m.extents.clone()).collect::<Vec<_>>();
+            existing_manifests.iter().map(|m| m.extents.clone()).collect::<Vec<_>>();
 
         let modified_splits = self
             .change_set
             .modified_manifest_extents_iterator(&node.id, &node.path)
             .collect::<HashSet<_>>();
 
-        // FIXME: there is an invariant here
         // ``modified_splits`` (i.e. splits used in this session)
         // must be a subset of ``splits`` (the splits set in the config)
+        debug_assert!(modified_splits.is_subset(&splits.iter().collect::<HashSet<_>>()));
 
-        // TODO: this should be try_fold with the refs HashMap as state
         for extent in splits.iter() {
             if modified_splits.contains(extent) {
                 // this split was modified in this session, rewrite it completely
@@ -1705,6 +1715,7 @@ impl<'a> FlushProcess<'a> {
                     .await?
                     .map(|new_ref| refs.insert(extent.clone(), vec![new_ref]));
             } else {
+                // intersection of the current split with extents on disk
                 let on_disk_bbox = on_disk_extents
                     .iter()
                     .filter_map(|e| e.intersection(extent))
@@ -1712,7 +1723,7 @@ impl<'a> FlushProcess<'a> {
 
                 // split was unmodified in this session. Let's look at the current manifests
                 // and see what we need to do with them
-                for old_ref in manifests.iter() {
+                for old_ref in existing_manifests.iter() {
                     // Remember that the extents written to disk are the `from`:`to` ranges
                     // of populated chunks
                     match old_ref.extents.overlap_with(extent) {
@@ -3330,10 +3341,8 @@ mod tests {
         ds.add_array(a2path.clone(), shape.clone(), dimension_names.clone(), def.clone())
             .await?;
 
-        dbg!("added arrays, now commit");
         let _ = ds.commit("first commit", None).await?;
 
-        dbg!("committed arrays");
         // there should be no manifests yet because we didn't add any chunks
         assert_eq!(
             0,
@@ -3358,7 +3367,6 @@ mod tests {
 
         let mut ds = repo.writable_session("main").await?;
 
-        dbg!("setting chunk ref");
         // add 3 chunks
         ds.set_chunk_ref(
             a1path.clone(),
diff --git a/icechunk/tests/test_gc.rs b/icechunk/tests/test_gc.rs
index 82da26a64..ac5d53535 100644
--- a/icechunk/tests/test_gc.rs
+++ b/icechunk/tests/test_gc.rs
@@ -67,6 +67,7 @@ pub async fn do_test_gc(
     let storage_settings = storage.default_settings();
 
     let shape = ArrayShape::new(vec![(1100, 1)]).unwrap();
+    // intentionally small to create garbage
     let manifest_split_size = 10;
     let split_sizes = Some(vec![(
         ManifestSplitCondition::PathMatches { regex: r".*".to_string() },
@@ -113,6 +114,7 @@ pub async fn do_test_gc(
     let mut ds = repo.writable_session("main").await?;
 
     // overwrite 10 chunks
+    // This will only overwrite one split manifest.
     for idx in 0..10 {
         let bytes = Bytes::copy_from_slice(&0i8.to_be_bytes());
         let payload = ds.get_chunk_writer()(bytes.clone()).await?;
@@ -167,6 +169,7 @@ pub async fn do_test_gc(
     )
     .await?;
     assert_eq!(summary.chunks_deleted, 10);
+    // only one manifest was re-created, so there is only one garbage manifest
     assert_eq!(summary.manifests_deleted, 1);
     assert_eq!(summary.snapshots_deleted, 1);
     assert!(summary.bytes_deleted > summary.chunks_deleted);

From b7fb3855d81091c38c3bd80033c807f5801cefc3 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Tue, 17 Jun 2025 14:18:59 -0600
Subject: [PATCH 36/43] Use ManifestExtents::ALL sentinel

---
 icechunk/src/change_set.rs      | 10 ++++------
 icechunk/src/format/manifest.rs | 22 ++++++++++++++++++++-
 icechunk/src/session.rs         | 35 +++++++++++++--------------------
 3 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs
index 25eb8e513..444c031b9 100644
--- a/icechunk/src/change_set.rs
+++ b/icechunk/src/change_set.rs
@@ -297,7 +297,7 @@ impl ChangeSet {
         &self,
         node_id: &NodeId,
         node_path: &Path,
-        extent: Option<ManifestExtents>,
+        extent: ManifestExtents,
     ) -> impl Iterator<Item = (&ChunkIndices, &Option<ChunkPayload>)> + use<'_> {
         if self.is_deleted(node_path, node_id) {
             return Either::Left(iter::empty());
@@ -306,9 +306,7 @@ impl ChangeSet {
             None => Either::Left(iter::empty()),
             Some(h) => Either::Right(
                 h.iter()
-                    .filter(move |(manifest_extent, _)| {
-                        extent.as_ref().is_none_or(|e| e == *manifest_extent)
-                    })
+                    .filter(move |(manifest_extent, _)| extent.matches(manifest_extent))
                     .flat_map(|(_, manifest)| manifest.iter()),
             ),
         }
@@ -318,7 +316,7 @@ impl ChangeSet {
         &self,
     ) -> impl Iterator<Item = (Path, ChunkInfo)> + use<'_> {
         self.new_arrays.iter().flat_map(|(path, (node_id, _))| {
-            self.new_array_chunk_iterator(node_id, path, None)
+            self.new_array_chunk_iterator(node_id, path, ManifestExtents::ALL)
                 .map(|ci| (path.clone(), ci))
         })
     }
@@ -327,7 +325,7 @@ impl ChangeSet {
         &'a self,
         node_id: &'a NodeId,
         node_path: &Path,
-        extent: Option<ManifestExtents>,
+        extent: ManifestExtents,
     ) -> impl Iterator<Item = ChunkInfo> + use<'a> {
         self.array_chunks_iterator(node_id, node_path, extent).filter_map(
             move |(coords, payload)| {
diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs
index d01efae11..37fe3d400 100644
--- a/icechunk/src/format/manifest.rs
+++ b/icechunk/src/format/manifest.rs
@@ -36,6 +36,9 @@ pub enum Overlap {
 pub struct ManifestExtents(Vec<Range<u32>>);
 
 impl ManifestExtents {
+    // sentinel for a "universal set"
+    pub const ALL: Self = Self(Vec::new());
+
     pub fn new(from: &[u32], to: &[u32]) -> Self {
         let v = from
             .iter()
@@ -66,6 +69,10 @@ impl ManifestExtents {
     }
 
     pub fn intersection(&self, other: &Self) -> Option<Self> {
+        if self == &Self::ALL {
+            return Some(other.clone());
+        }
+
         debug_assert_eq!(self.len(), other.len());
         let ranges = zip(self.iter(), other.iter())
             .map(|(a, b)| max(a.start, b.start)..min(a.end, b.end))
@@ -74,6 +81,9 @@ impl ManifestExtents {
     }
 
     pub fn union(&self, other: &Self) -> Self {
+        if self == &Self::ALL {
+            return Self::ALL;
+        }
         debug_assert_eq!(self.len(), other.len());
         Self::from_ranges_iter(
             zip(self.iter(), other.iter())
@@ -83,13 +93,17 @@ impl ManifestExtents {
 
     pub fn overlap_with(&self, other: &Self) -> Overlap {
         // Important: this is not symmetric.
+        if *other == Self::ALL {
+            return Overlap::Complete;
+        } else if *self == Self::ALL {
+            return Overlap::Partial;
+        }
         debug_assert!(
             self.len() == other.len(),
             "Length mismatch: self = {:?}, other = {:?}",
             &self,
             &other
         );
-
         let mut overlap = Overlap::Complete;
         for (a, b) in zip(other.iter(), self.iter()) {
             debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone());
@@ -102,6 +116,12 @@ impl ManifestExtents {
         }
         overlap
     }
+
+    /// Filter predicate: `ALL` matches any extents; anything else must be equal.
+    pub fn matches(&self, other: &ManifestExtents) -> bool {
+        // `ALL` is the "universal set" sentinel, so it accepts every `other`.
+        *self == Self::ALL || self == other
+    }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 0576b96da..8690bb404 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -628,7 +628,7 @@ impl Session {
             &self.change_set,
             &self.snapshot_id,
             path,
-            None,
+            ManifestExtents::ALL,
         )
         .await
     }
@@ -870,7 +870,7 @@ impl Session {
             &self.change_set,
             &self.snapshot_id,
             node.clone(),
-            None,
+            ManifestExtents::ALL,
         )
         .await
         .map_ok(|(_path, chunk_info)| chunk_info.coord);
@@ -878,7 +878,7 @@ impl Session {
         let res = try_stream! {
             let new_chunks = stream::iter(
                 self.change_set
-                    .new_array_chunk_iterator(&node.id, array_path, None)
+                    .new_array_chunk_iterator(&node.id, array_path, ManifestExtents::ALL)
                     .map(|chunk_info| Ok::<ChunkIndices, SessionError>(chunk_info.coord)),
             );
 
@@ -1210,7 +1210,7 @@ async fn updated_chunk_iterator<'a>(
             change_set,
             snapshot_id,
             node,
-            None,
+            ManifestExtents::ALL,
         )
         .await)
     });
@@ -1222,7 +1222,7 @@ async fn updated_node_chunks_iterator<'a>(
     change_set: &'a ChangeSet,
     snapshot_id: &'a SnapshotId,
     node: NodeSnapshot,
-    extent: Option<ManifestExtents>,
+    extent: ManifestExtents,
 ) -> impl Stream<Item = SessionResult<(Path, ChunkInfo)>> + 'a {
     // This iterator should yield chunks for existing arrays + any updates.
     // we check for deletion here in the case that `path` exists in the snapshot,
@@ -1252,7 +1252,7 @@ async fn node_chunk_iterator<'a>(
     change_set: &'a ChangeSet,
     snapshot_id: &'a SnapshotId,
     path: &Path,
-    extent: Option<ManifestExtents>,
+    extent: ManifestExtents,
 ) -> impl Stream<Item = SessionResult<ChunkInfo>> + 'a + use<'a> {
     match get_node(asset_manager, change_set, snapshot_id, path).await {
         Ok(node) => futures::future::Either::Left(
@@ -1275,7 +1275,7 @@ async fn verified_node_chunk_iterator<'a>(
     snapshot_id: &'a SnapshotId,
     change_set: &'a ChangeSet,
     node: NodeSnapshot,
-    extent: Option<ManifestExtents>,
+    extent: ManifestExtents,
 ) -> impl Stream<Item = SessionResult<ChunkInfo>> + 'a {
     match node.node_data {
         NodeData::Group => futures::future::Either::Left(futures::stream::empty()),
@@ -1308,9 +1308,10 @@ async fn verified_node_chunk_iterator<'a>(
                 futures::stream::iter(new_chunks).chain(
                     futures::stream::iter(manifests)
                         .filter(move |manifest_ref| {
-                            futures::future::ready(extent.as_ref().is_none_or(|e| {
-                                e.overlap_with(&manifest_ref.extents) != Overlap::None
-                            }))
+                            futures::future::ready(
+                                extent.overlap_with(&manifest_ref.extents)
+                                    != Overlap::None,
+                            )
                         })
                         .then(move |manifest_ref| {
                             let new_chunk_indices = new_chunk_indices.clone();
@@ -1333,11 +1334,7 @@ async fn verified_node_chunk_iterator<'a>(
                                                 !new_chunk_indices.contains(coord)
                                                     // If the manifest we are parsing partially overlaps with `extent`,
                                                     // we need to filter all coordinates
-                                                    && extent_c2.as_ref().is_none_or(
-                                                        move |e| {
-                                                            e.contains(coord.0.as_slice())
-                                                        },
-                                                    )
+                                                    && extent_c2.contains(coord.0.as_slice())
                                             })
                                             .map_ok(move |(coord, payload)| ChunkInfo {
                                                 node: node_id_c2.clone(),
@@ -1612,7 +1609,7 @@ impl<'a> FlushProcess<'a> {
             self.change_set,
             self.parent_id,
             node.clone(),
-            Some(extent.clone()),
+            extent.clone(),
         )
         .await
         .map_ok(|(_path, chunk_info)| chunk_info);
@@ -1664,11 +1661,7 @@ impl<'a> FlushProcess<'a> {
             if self.change_set.array_manifest(node_id, extent).is_some() {
                 let chunks = stream::iter(
                     self.change_set
-                        .new_array_chunk_iterator(
-                            node_id,
-                            node_path,
-                            Some(extent.clone()),
-                        )
+                        .new_array_chunk_iterator(node_id, node_path, extent.clone())
                         .map(Ok),
                 );
                 #[allow(clippy::expect_used)]

From 4a2e455f80d9b0c0fc2d8fc7e159b7b6bddbe8c2 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Wed, 18 Jun 2025 16:06:32 -0600
Subject: [PATCH 37/43] Update stateful tests

---
 .../tests/test_zarr/test_stateful.py          | 178 +++++++++++++++---
 icechunk/src/repository.rs                    |  40 ++++
 icechunk/src/session.rs                       |   7 +-
 3 files changed, 199 insertions(+), 26 deletions(-)

diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py
index 17e1c7f85..f94938f85 100644
--- a/icechunk-python/tests/test_zarr/test_stateful.py
+++ b/icechunk-python/tests/test_zarr/test_stateful.py
@@ -1,4 +1,5 @@
 import json
+from collections.abc import Iterable
 from typing import Any
 
 import hypothesis.extra.numpy as npst
@@ -14,8 +15,9 @@
     run_state_machine_as_test,
 )
 
+import icechunk as ic
 import zarr
-from icechunk import Repository, in_memory_storage
+from icechunk import Repository, Storage, in_memory_storage
 from zarr.core.buffer import default_buffer_prototype
 from zarr.testing.stateful import ZarrHierarchyStateMachine
 from zarr.testing.strategies import (
@@ -23,10 +25,64 @@
     node_names,
     np_array_and_chunks,
     numpy_arrays,
+    orthogonal_indices,
 )
 
 PROTOTYPE = default_buffer_prototype()
 
+pytestmark = [
+    pytest.mark.filterwarnings(
+        "ignore::zarr.core.dtype.common.UnstableSpecificationWarning"
+    ),
+]
+
+
+import functools
+
+
+def with_frequency(frequency):
+    """
+    Decorator to control how frequently a rule runs in Hypothesis stateful tests.
+
+    Args:
+        frequency: Float between 0 and 1, where 1.0 means always run,
+                  0.1 means run ~10% of the time, etc.
+
+    Usage:
+        @rule()
+        @with_frequency(0.1)  # Run ~10% of the time
+        def rare_operation(self):
+            pass
+    """
+
+    def decorator(func):
+        # Per-rule counter attribute name; the counter itself is stored on `self`
+        counter_attr = f"__{func.__name__}_counter"
+
+        @functools.wraps(func)
+        def wrapper(self, *args, **kwargs):
+            return func(self, *args, **kwargs)
+
+        # `precondition(fn)` yields a decorator; it is applied to `wrapper` below
+        @precondition
+        def frequency_check(self):
+            # Lazily create the counter the first time this check is evaluated
+            if not hasattr(self, counter_attr):
+                setattr(self, counter_attr, 0)
+
+            # Advances on every precondition evaluation, not only when the rule runs
+            current_count = getattr(self, counter_attr) + 1
+            setattr(self, counter_attr, current_count)
+
+            # Allow the rule when the fractional part of count * frequency lands
+            # in the top `frequency` band; roughly the right rate over many calls
+            return (current_count * frequency) % 1.0 >= (1.0 - frequency)
+
+        # Apply the precondition to the wrapped function
+        return frequency_check(wrapper)
+
+    return decorator
+
 
 @st.composite
 def chunk_paths(
@@ -41,14 +97,66 @@ def chunk_paths(
     return "/".join(map(str, blockidx[subset_slicer]))
 
 
+@st.composite
+def splitting_configs(
+    draw: st.DrawFn, *, arrays: Iterable[zarr.Array]
+) -> ic.ManifestSplittingConfig:
+    config_dict = {}
+    for array in arrays:
+        if draw(st.booleans()):
+            array_condition = ic.ManifestSplitCondition.name_matches(
+                array.path.split("/")[-1]
+            )
+        else:
+            array_condition = ic.ManifestSplitCondition.path_matches(array.path)
+        dimnames = array.metadata.dimension_names or (None,) * array.ndim
+        dimsize_axis_names = draw(
+            st.lists(
+                st.sampled_from(
+                    tuple(zip(array.shape, range(array.ndim), dimnames, strict=False))
+                ),
+                min_size=1,
+                unique=True,
+            )
+        )
+        for size, axis, dimname in dimsize_axis_names:
+            if dimname is None or draw(st.booleans()):
+                key = ic.ManifestSplitDimCondition.Axis(axis)
+            else:
+                key = ic.ManifestSplitDimCondition.DimensionName(dimname)
+            config_dict[array_condition] = {
+                key: draw(st.integers(min_value=1, max_value=size + 10))
+            }
+        return ic.ManifestSplittingConfig.from_dict(config_dict)
+
+
 # TODO: more before/after commit invariants?
 # TODO: add "/" to self.all_groups, deleting "/" seems to be problematic
 class ModifiedZarrHierarchyStateMachine(ZarrHierarchyStateMachine):
-    def __init__(self, repo: Repository) -> None:
-        self.repo = repo
-        store = repo.writable_session("main").store
+    def __init__(self, storage: Storage) -> None:
+        self.storage = storage
+        self.repo = Repository.create(self.storage)
+        store = self.repo.writable_session("main").store
         super().__init__(store)
 
+    @precondition(
+        lambda self: not self.store.session.has_uncommitted_changes
+        and bool(self.all_arrays)
+    )
+    @rule(data=st.data())
+    def reopen_with_config(self, data):
+        array_paths = data.draw(
+            st.lists(st.sampled_from(sorted(self.all_arrays)), max_size=3, unique=True)
+        )
+        arrays = tuple(zarr.open_array(self.model, path=path) for path in array_paths)
+        sconfig = data.draw(splitting_configs(arrays=arrays))
+        config = ic.RepositoryConfig(
+            inline_chunk_threshold_bytes=0, manifest=ic.ManifestConfig(splitting=sconfig)
+        )
+        note(f"reopening with splitting config {sconfig=!r}")
+        self.repo = Repository.open(self.storage, config=config)
+        self.store = self.repo.writable_session("main").store
+
     @precondition(lambda self: self.store.session.has_uncommitted_changes)
     @rule(data=st.data())
     def commit_with_check(self, data) -> None:
@@ -110,8 +218,49 @@ def add_array(
         assume(array.size > 0)
         super().add_array(data, name, array_and_chunks)
 
+    @precondition(lambda self: bool(self.all_groups))
+    @rule(data=st.data())
+    def check_list_dir(self, data: st.DataObject) -> None:
+        path = self.draw_directory(data)
+        note(f"list_dir for {path=!r}")
+        model_ls = sorted(self._sync_iter(self.model.list_dir(path)))
+        store_ls = sorted(self._sync_iter(self.store.list_dir(path)))
+        if model_ls != store_ls and set(model_ls).symmetric_difference(set(store_ls)) != {
+            "c"
+        }:
+            # Consider .list_dir("path/to/array") for an array with a single chunk.
+            # The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists
+            # If that chunk was deleted, then `"c"` is not returned.
+            # LocalStore will not have this behaviour :/
+            # In Icechunk, we always return the `c` so ignore this inconsistency.
+            assert model_ls == store_ls, (model_ls, store_ls)
+
     #####  TODO: port everything below to zarr
+    @precondition(lambda self: bool(self.all_arrays))
+    @rule(data=st.data())
+    def check_array(self, data: st.DataObject) -> None:
+        path = data.draw(st.sampled_from(sorted(self.all_arrays)))
+        actual = zarr.open_array(self.store, path=path)[:]
+        expected = zarr.open_array(self.model, path=path)[:]
+        np.testing.assert_equal(actual, expected)
+
+    @precondition(lambda self: bool(self.all_arrays))
+    @rule(data=st.data())
+    def overwrite_array_orthogonal_indexing(self, data: st.DataObject) -> None:
+        array = data.draw(st.sampled_from(sorted(self.all_arrays)))
+        model_array = zarr.open_array(path=array, store=self.model)
+        store_array = zarr.open_array(path=array, store=self.store)
+        indexer, _ = data.draw(orthogonal_indices(shape=model_array.shape))
+        note(f"overwriting array orthogonal {indexer=}")
+        new_data = data.draw(
+            npst.arrays(shape=model_array.oindex[indexer].shape, dtype=model_array.dtype)
+        )
+        model_array.oindex[indexer] = new_data
+        store_array.oindex[indexer] = new_data
+
+    #####  TODO: delete after next Zarr release (Jun 18, 2025)
     @rule()
+    @with_frequency(0.25)
     def clear(self) -> None:
         note("clearing")
         import zarr
@@ -154,23 +303,6 @@ def draw_directory(self, data) -> str:
             path = array_or_group
         return path
 
-    @precondition(lambda self: bool(self.all_groups))
-    @rule(data=st.data())
-    def check_list_dir(self, data) -> None:
-        path = self.draw_directory(data)
-        note(f"list_dir for {path=!r}")
-        model_ls = sorted(self._sync_iter(self.model.list_dir(path)))
-        store_ls = sorted(self._sync_iter(self.store.list_dir(path)))
-        if model_ls != store_ls and set(model_ls).symmetric_difference(set(store_ls)) != {
-            "c"
-        }:
-            # Consider .list_dir("path/to/array") for an array with a single chunk.
-            # The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists
-            # If that chunk was deleted, then `"c"` is not returned.
-            # LocalStore will not have this behaviour :/
-            # In Icechunk, we always return the `c` so ignore this inconsistency.
-            assert model_ls == store_ls, (model_ls, store_ls)
-
     @precondition(lambda self: bool(self.all_arrays))
     @rule(data=st.data())
     def delete_chunk(self, data) -> None:
@@ -247,10 +379,8 @@ def check_list_prefix_from_root(self) -> None:
 
 
 def test_zarr_hierarchy() -> None:
-    repo = Repository.create(in_memory_storage())
-
     def mk_test_instance_sync() -> ModifiedZarrHierarchyStateMachine:
-        return ModifiedZarrHierarchyStateMachine(repo)
+        return ModifiedZarrHierarchyStateMachine(in_memory_storage())
 
     run_state_machine_as_test(
         mk_test_instance_sync, settings=Settings(report_multiple_bugs=False)
diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs
index 9f86530f6..919c0848d 100644
--- a/icechunk/src/repository.rs
+++ b/icechunk/src/repository.rs
@@ -1578,6 +1578,46 @@ mod tests {
             assert_eq!(val, Bytes::copy_from_slice(format!("{0}", i).as_bytes()));
         }
 
+        // delete all chunks
+        let mut session = repository.writable_session("main").await?;
+        for i in 0..dim_size {
+            session
+                .set_chunk_ref(temp_path.clone(), ChunkIndices(vec![i, 0, 0]), None)
+                .await?;
+        }
+        total_manifests += 0;
+        session.commit("clear existing array", None).await?;
+        assert_manifest_count(&storage, total_manifests).await;
+
+        // add a new array
+        let def = Bytes::from_static(br#"{"this":"array"}"#);
+        let array_path: Path = "/array2".to_string().try_into().unwrap();
+        let mut session = repository.writable_session("main").await?;
+        session
+            .add_array(
+                array_path.clone(),
+                shape.clone(),
+                dimension_names.clone(),
+                def.clone(),
+            )
+            .await?;
+        // set a chunk
+        session
+            .set_chunk_ref(
+                array_path.clone(),
+                ChunkIndices(vec![1, 0, 0]),
+                Some(ChunkPayload::Inline(format!("{0}", 10).into())),
+            )
+            .await?;
+        // delete that chunk, so the chunks iterator is empty
+        // regression test for bug found by hypothesis
+        session
+            .set_chunk_ref(array_path.clone(), ChunkIndices(vec![1, 0, 0]), None)
+            .await?;
+        total_manifests += 0;
+        session.commit("clear new array", None).await?;
+        assert_manifest_count(&storage, total_manifests).await;
+
         Ok(())
     }
 
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 8690bb404..1e0777b04 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -1665,10 +1665,13 @@ impl<'a> FlushProcess<'a> {
                         .map(Ok),
                 );
                 #[allow(clippy::expect_used)]
-                let new_ref = self.write_manifest_from_iterator(chunks).await?.expect(
+                let new_ref = self.write_manifest_from_iterator(chunks).await.expect(
                     "logic bug. for a new node, we must always write the manifest",
                 );
-                self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref);
+                // new_ref is None if there were no chunks in the iterator
+                if let Some(new_ref) = new_ref {
+                    self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref);
+                }
             }
         }
         Ok(())

From b179dbc390fdfc71bef5e5df8f0c4ed2d85402af Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 20 Jun 2025 14:29:30 -0600
Subject: [PATCH 38/43] update benchmarks dep group

---
 icechunk-python/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/icechunk-python/pyproject.toml b/icechunk-python/pyproject.toml
index 478bf3a0e..ee2db15b6 100644
--- a/icechunk-python/pyproject.toml
+++ b/icechunk-python/pyproject.toml
@@ -51,6 +51,7 @@ benchmark = [
   "humanize",
   "platformdirs",
   "ipdb",
+  "coiled",
 ]
 docs = [
   "scipy",

From e184e2998a54632db113fee87762ad2fff57df3f Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Fri, 20 Jun 2025 16:14:28 -0600
Subject: [PATCH 39/43] small edits

---
 icechunk-python/benchmarks/conftest.py           | 10 +++++-----
 icechunk-python/tests/test_zarr/test_stateful.py | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/icechunk-python/benchmarks/conftest.py b/icechunk-python/benchmarks/conftest.py
index d45e2b300..2617b1c20 100644
--- a/icechunk-python/benchmarks/conftest.py
+++ b/icechunk-python/benchmarks/conftest.py
@@ -65,11 +65,11 @@ def large_write_dataset(request) -> BenchmarkWriteDataset:
 
 @pytest.fixture(
     params=[
-        # pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
-        # pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
-        # pytest.param(ERA5_SINGLE, id="era5-single"),
-        # pytest.param(ERA5, id="era5-weatherbench"),
-        # pytest.param(ERA5_ARCO, id="era5-arco"),
+        pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
+        pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
+        pytest.param(ERA5_SINGLE, id="era5-single"),
+        pytest.param(ERA5, id="era5-weatherbench"),
+        pytest.param(ERA5_ARCO, id="era5-arco"),
         pytest.param(LARGE_MANIFEST_UNSHARDED, id="large-manifest-no-split"),
         pytest.param(
             LARGE_MANIFEST_SHARDED,
diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py
index f94938f85..3ad392b69 100644
--- a/icechunk-python/tests/test_zarr/test_stateful.py
+++ b/icechunk-python/tests/test_zarr/test_stateful.py
@@ -30,11 +30,11 @@
 
 PROTOTYPE = default_buffer_prototype()
 
-pytestmark = [
-    pytest.mark.filterwarnings(
-        "ignore::zarr.core.dtype.common.UnstableSpecificationWarning"
-    ),
-]
+# pytestmark = [
+#     pytest.mark.filterwarnings(
+#         "ignore::zarr.core.dtype.common.UnstableSpecificationWarning"
+#     ),
+# ]
 
 
 import functools

From 3b522597aa9c4ceecc5ae8a3097ebf24064faa41 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Sat, 21 Jun 2025 17:35:58 -0600
Subject: [PATCH 40/43] Parallel writes

---
 icechunk/src/session.rs | 189 +++++++++++++++++++++++-----------------
 1 file changed, 107 insertions(+), 82 deletions(-)

diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index 1e0777b04..a7e8650c5 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -12,7 +12,9 @@ use async_stream::try_stream;
 use bytes::Bytes;
 use chrono::{DateTime, Utc};
 use err_into::ErrorInto;
-use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::Either, stream};
+use futures::{
+    FutureExt, Stream, StreamExt, TryStream, TryStreamExt, future::Either, stream,
+};
 use itertools::{Itertools as _, enumerate, repeat_n};
 use regex::bytes::Regex;
 use serde::{Deserialize, Serialize};
@@ -1572,6 +1574,40 @@ pub fn construct_valid_byte_range(
     }
 }
 
+/// Builds a manifest from `chunks` and writes it through `asset_manager`.
+///
+/// Returns `Ok(None)` when `Manifest::from_stream` produces no manifest.
+async fn write_manifest_from_iterator(
+    asset_manager: &AssetManager,
+    chunks: impl Stream<Item = SessionResult<ChunkInfo>>,
+) -> SessionResult<Option<(ManifestRef, ManifestFileInfo)>> {
+    let (mut from, mut to) = (vec![], vec![]);
+    let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord);
+
+    let Some(manifest) = Manifest::from_stream(chunks)
+        .await
+        .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))?
+    else {
+        return Ok(None);
+    };
+
+    let manifest = Arc::new(manifest);
+    let size = asset_manager.write_manifest(Arc::clone(&manifest)).await?;
+    let file_info = ManifestFileInfo::new(manifest.as_ref(), size);
+
+    let manifest_ref = ManifestRef {
+        object_id: manifest.id().clone(),
+        extents: ManifestExtents::new(&from, &to),
+    };
+    Ok(Some((manifest_ref, file_info)))
+}
+
+fn map_ok_second<A, B, E>(
+    stream: impl TryStream<Item = Result<(A, B), E>, Ok = (A, B), Error = E>,
+) -> impl TryStream<Item = Result<B, E>, Ok = B, Error = E> {
+    stream.map_ok(|(_, second)| second)
+}
+
 struct FlushProcess<'a> {
     asset_manager: Arc<AssetManager>,
     change_set: &'a ChangeSet,
@@ -1598,54 +1634,6 @@ impl<'a> FlushProcess<'a> {
         }
     }
 
-    async fn write_manifest_for_updated_chunks(
-        &mut self,
-        node: &NodeSnapshot,
-        extent: &ManifestExtents,
-    ) -> SessionResult<Option<ManifestRef>> {
-        let asset_manager = Arc::clone(&self.asset_manager);
-        let updated_chunks = updated_node_chunks_iterator(
-            asset_manager.as_ref(),
-            self.change_set,
-            self.parent_id,
-            node.clone(),
-            extent.clone(),
-        )
-        .await
-        .map_ok(|(_path, chunk_info)| chunk_info);
-        self.write_manifest_from_iterator(updated_chunks).await
-    }
-
-    async fn write_manifest_from_iterator(
-        &mut self,
-        chunks: impl Stream<Item = SessionResult<ChunkInfo>>,
-    ) -> SessionResult<Option<ManifestRef>> {
-        let mut from = vec![];
-        let mut to = vec![];
-        let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord);
-
-        if let Some(new_manifest) = Manifest::from_stream(chunks)
-            .await
-            .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))?
-        {
-            let new_manifest = Arc::new(new_manifest);
-            let new_manifest_size =
-                self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?;
-
-            let file_info =
-                ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size);
-            self.manifest_files.insert(file_info);
-
-            let new_ref = ManifestRef {
-                object_id: new_manifest.id().clone(),
-                extents: ManifestExtents::new(&from, &to),
-            };
-            Ok(Some(new_ref))
-        } else {
-            Ok(None)
-        }
-    }
-
     /// Write a manifest for a node that was created in this session
     /// It doesn't need to look at previous manifests because the node is new
     async fn write_manifest_for_new_node(
@@ -1657,23 +1645,38 @@ impl<'a> FlushProcess<'a> {
         let splits =
             self.splits.get(node_id).expect("getting split for node unexpectedly failed");
 
-        for extent in splits.iter() {
-            if self.change_set.array_manifest(node_id, extent).is_some() {
-                let chunks = stream::iter(
-                    self.change_set
-                        .new_array_chunk_iterator(node_id, node_path, extent.clone())
-                        .map(Ok),
-                );
-                #[allow(clippy::expect_used)]
-                let new_ref = self.write_manifest_from_iterator(chunks).await.expect(
-                    "logic bug. for a new node, we must always write the manifest",
-                );
-                // new_ref is None if there were no chunks in the iterator
-                if let Some(new_ref) = new_ref {
-                    self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref);
+        let iterators = splits
+            .iter()
+            .filter_map(|extent| {
+                if self.change_set.array_manifest(node_id, extent).is_some() {
+                    let chunks = stream::iter(
+                        self.change_set
+                            .new_array_chunk_iterator(node_id, node_path, extent.clone())
+                            .map(Ok),
+                    );
+                    Some(chunks)
+                } else {
+                    None
                 }
-            }
+            })
+            .collect::<Vec<_>>();
+
+        // Propagate write failures to the caller instead of panicking: an `Err`
+        // here is a manifest creation/write error, not a logic bug. `Ok(None)`
+        // only means the split had no chunks to write; it is skipped below.
+        let asset_manager = self.asset_manager.as_ref();
+        let new_refs = futures::future::try_join_all(
+            iterators
+                .into_iter()
+                .map(|chunks| write_manifest_from_iterator(asset_manager, chunks)),
+        )
+        .await?;
+
+        for (new_ref, file_info) in new_refs.into_iter().flatten() {
+            self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref);
+            self.manifest_files.insert(file_info);
         }
+
         Ok(())
     }
 
@@ -1689,9 +1692,6 @@ impl<'a> FlushProcess<'a> {
         #[allow(clippy::expect_used)]
         let splits =
             self.splits.get(&node.id).expect("splits should exist for this node.");
-        let mut refs =
-            HashMap::<ManifestExtents, Vec<ManifestRef>>::with_capacity(splits.len());
-
         let on_disk_extents =
             existing_manifests.iter().map(|m| m.extents.clone()).collect::<Vec<_>>();
 
@@ -1704,12 +1704,23 @@ impl<'a> FlushProcess<'a> {
         // must be a subset of ``splits`` (the splits set in the config)
         debug_assert!(modified_splits.is_subset(&splits.iter().collect::<HashSet<_>>()));
 
+        let mut tasks = Vec::new();
         for extent in splits.iter() {
             if modified_splits.contains(extent) {
-                // this split was modified in this session, rewrite it completely
-                self.write_manifest_for_updated_chunks(node, extent)
-                    .await?
-                    .map(|new_ref| refs.insert(extent.clone(), vec![new_ref]));
+                let updated_chunks = map_ok_second(
+                    updated_node_chunks_iterator(
+                        self.asset_manager.as_ref(),
+                        self.change_set,
+                        self.parent_id,
+                        node.clone(),
+                        extent.clone(),
+                    )
+                    .await,
+                );
+                tasks.push(write_manifest_from_iterator(
+                    self.asset_manager.as_ref(),
+                    updated_chunks,
+                ));
             } else {
                 // intersection of the current split with extents on disk
                 let on_disk_bbox = on_disk_extents
@@ -1726,7 +1737,10 @@ impl<'a> FlushProcess<'a> {
                         Overlap::Complete => {
                             debug_assert!(on_disk_bbox.is_some());
                             // Just propagate this ref again, no rewriting necessary
-                            refs.entry(extent.clone()).or_default().push(old_ref.clone());
+                            self.manifest_refs
+                                .entry(node.id.clone())
+                                .or_default()
+                                .push(old_ref.clone());
                             // OK to unwrap here since this manifest file must exist in the old snapshot
                             #[allow(clippy::expect_used)]
                             self.manifest_files.insert(
@@ -1737,12 +1751,20 @@ impl<'a> FlushProcess<'a> {
                             // the splits have changed, but no refs in this split have been written in this session
                             // same as `if` block above
                             debug_assert!(on_disk_bbox.is_some());
-                            if let Some(new_ref) = self
-                                .write_manifest_for_updated_chunks(node, extent)
-                                .await?
-                            {
-                                refs.entry(extent.clone()).or_default().push(new_ref);
-                            }
+                            let updated_chunks = map_ok_second(
+                                updated_node_chunks_iterator(
+                                    self.asset_manager.as_ref(),
+                                    self.change_set,
+                                    self.parent_id,
+                                    node.clone(),
+                                    extent.clone(),
+                                )
+                                .await,
+                            );
+                            tasks.push(write_manifest_from_iterator(
+                                self.asset_manager.as_ref(),
+                                updated_chunks,
+                            ));
                         }
                         Overlap::None => {
                             // Nothing to do
@@ -1752,12 +1774,15 @@ impl<'a> FlushProcess<'a> {
             }
         }
 
+        let new_refs = futures::future::join_all(tasks.into_iter()).await;
+
         // FIXME: Assert that bboxes in refs don't overlap
 
-        self.manifest_refs
-            .entry(node.id.clone())
-            .or_default()
-            .extend(refs.into_values().flatten());
+        for (new_ref, file_info) in new_refs.into_iter().flatten().flatten() {
+            self.manifest_refs.entry(node.id.clone()).or_default().push(new_ref);
+            self.manifest_files.insert(file_info);
+        }
+
         Ok(())
     }
 

From 4e6823197d63268f7447dd9f0743c72331411a36 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Sat, 21 Jun 2025 17:38:04 -0600
Subject: [PATCH 41/43] more benchmarks

---
 icechunk-python/benchmarks/conftest.py        | 10 ++--
 .../benchmarks/test_benchmark_writes.py       | 50 ++++++++++++++++++-
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/icechunk-python/benchmarks/conftest.py b/icechunk-python/benchmarks/conftest.py
index 2617b1c20..d45e2b300 100644
--- a/icechunk-python/benchmarks/conftest.py
+++ b/icechunk-python/benchmarks/conftest.py
@@ -65,11 +65,11 @@ def large_write_dataset(request) -> BenchmarkWriteDataset:
 
 @pytest.fixture(
     params=[
-        pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
-        pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
-        pytest.param(ERA5_SINGLE, id="era5-single"),
-        pytest.param(ERA5, id="era5-weatherbench"),
-        pytest.param(ERA5_ARCO, id="era5-arco"),
+        # pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
+        # pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
+        # pytest.param(ERA5_SINGLE, id="era5-single"),
+        # pytest.param(ERA5, id="era5-weatherbench"),
+        # pytest.param(ERA5_ARCO, id="era5-arco"),
         pytest.param(LARGE_MANIFEST_UNSHARDED, id="large-manifest-no-split"),
         pytest.param(
             LARGE_MANIFEST_SHARDED,
diff --git a/icechunk-python/benchmarks/test_benchmark_writes.py b/icechunk-python/benchmarks/test_benchmark_writes.py
index 8c82fa4c9..ba0046725 100644
--- a/icechunk-python/benchmarks/test_benchmark_writes.py
+++ b/icechunk-python/benchmarks/test_benchmark_writes.py
@@ -181,7 +181,7 @@ def write():
 
 
 @pytest.mark.benchmark(group="refs-write")
-def test_write_split_manifest_refs(benchmark, splitting, large_write_dataset) -> None:
+def test_write_split_manifest_refs_full_rewrite(benchmark, splitting, large_write_dataset) -> None:
     dataset = large_write_dataset
     config = repo_config_with(splitting=splitting)
     assert config is not None
@@ -219,3 +219,51 @@ def commit(session_from_setup):
         session_from_setup.commit("wrote refs")
 
     benchmark.pedantic(commit, setup=write_refs, iterations=1, rounds=10)
+
+
+@pytest.mark.benchmark(group="refs-write")
+def test_write_split_manifest_refs_append(benchmark, splitting, large_write_dataset) -> None:
+    dataset = large_write_dataset
+    config = repo_config_with(splitting=splitting)
+    assert config is not None
+    if hasattr(config.manifest, "splitting"):
+        assert config.manifest.splitting == splitting
+    repo = dataset.create(config=config)
+    session = repo.writable_session(branch="main")
+    store = session.store
+    group = zarr.open_group(store, zarr_format=3)
+    group.create_array(
+        "array",
+        shape=dataset.shape,
+        chunks=dataset.chunks,
+        dtype="int8",
+        fill_value=0,
+        compressors=None,
+    )
+    session.commit("initialize")
+
+    # Hack: each round's setup writes the next batch, so only the commit is timed.
+    global counter
+    counter = 0
+    rounds = 10
+    num_chunks = dataset.shape[0] // dataset.chunks[0]
+    batch_size = num_chunks // rounds
+
+    def write_refs() -> Session:
+        global counter
+        session = repo.writable_session(branch="main")
+        chunks = [
+            VirtualChunkSpec(
+                index=[i], location=f"s3://foo/bar/{i}.nc", offset=0, length=1
+            )
+            for i in range(counter * batch_size, counter * batch_size + batch_size)
+        ]
+        counter += 1
+        session.store.set_virtual_refs("array", chunks)
+        # pedantic forwards setup's (args, kwargs) to the timed `commit` call
+        return ((session,), {})
+
+    def commit(session_from_setup):
+        session_from_setup.commit("wrote refs")
+
+    benchmark.pedantic(commit, setup=write_refs, iterations=1, rounds=rounds)

From d0c42becf67817a575000b6db31eeac7fdbeec88 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 23 Jun 2025 11:47:20 -0600
Subject: [PATCH 42/43] Revert "Parallel writes"

This reverts commit 3b522597aa9c4ceecc5ae8a3097ebf24064faa41.
---
 icechunk/src/session.rs | 189 +++++++++++++++++-----------------------
 1 file changed, 82 insertions(+), 107 deletions(-)

diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
index a7e8650c5..1e0777b04 100644
--- a/icechunk/src/session.rs
+++ b/icechunk/src/session.rs
@@ -12,9 +12,7 @@ use async_stream::try_stream;
 use bytes::Bytes;
 use chrono::{DateTime, Utc};
 use err_into::ErrorInto;
-use futures::{
-    FutureExt, Stream, StreamExt, TryStream, TryStreamExt, future::Either, stream,
-};
+use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::Either, stream};
 use itertools::{Itertools as _, enumerate, repeat_n};
 use regex::bytes::Regex;
 use serde::{Deserialize, Serialize};
@@ -1574,40 +1572,6 @@ pub fn construct_valid_byte_range(
     }
 }
 
-async fn write_manifest_from_iterator(
-    asset_manager: &AssetManager,
-    chunks: impl Stream<Item = SessionResult<ChunkInfo>>,
-) -> SessionResult<Option<(ManifestRef, ManifestFileInfo)>> {
-    let mut from = vec![];
-    let mut to = vec![];
-    let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord);
-
-    if let Some(new_manifest) = Manifest::from_stream(chunks)
-        .await
-        .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))?
-    {
-        let new_manifest = Arc::new(new_manifest);
-        let new_manifest_size =
-            asset_manager.write_manifest(Arc::clone(&new_manifest)).await?;
-
-        let file_info = ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size);
-
-        let new_ref = ManifestRef {
-            object_id: new_manifest.id().clone(),
-            extents: ManifestExtents::new(&from, &to),
-        };
-        Ok(Some((new_ref, file_info)))
-    } else {
-        Ok(None)
-    }
-}
-
-fn map_ok_second<A, B, E>(
-    stream: impl TryStream<Item = Result<(A, B), E>, Ok = (A, B), Error = E>,
-) -> impl TryStream<Item = Result<B, E>, Ok = B, Error = E> {
-    stream.map_ok(|(_, second)| second)
-}
-
 struct FlushProcess<'a> {
     asset_manager: Arc<AssetManager>,
     change_set: &'a ChangeSet,
@@ -1634,6 +1598,54 @@ impl<'a> FlushProcess<'a> {
         }
     }
 
+    async fn write_manifest_for_updated_chunks(
+        &mut self,
+        node: &NodeSnapshot,
+        extent: &ManifestExtents,
+    ) -> SessionResult<Option<ManifestRef>> {
+        let asset_manager = Arc::clone(&self.asset_manager);
+        let updated_chunks = updated_node_chunks_iterator(
+            asset_manager.as_ref(),
+            self.change_set,
+            self.parent_id,
+            node.clone(),
+            extent.clone(),
+        )
+        .await
+        .map_ok(|(_path, chunk_info)| chunk_info);
+        self.write_manifest_from_iterator(updated_chunks).await
+    }
+
+    async fn write_manifest_from_iterator(
+        &mut self,
+        chunks: impl Stream<Item = SessionResult<ChunkInfo>>,
+    ) -> SessionResult<Option<ManifestRef>> {
+        let mut from = vec![];
+        let mut to = vec![];
+        let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord);
+
+        if let Some(new_manifest) = Manifest::from_stream(chunks)
+            .await
+            .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))?
+        {
+            let new_manifest = Arc::new(new_manifest);
+            let new_manifest_size =
+                self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?;
+
+            let file_info =
+                ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size);
+            self.manifest_files.insert(file_info);
+
+            let new_ref = ManifestRef {
+                object_id: new_manifest.id().clone(),
+                extents: ManifestExtents::new(&from, &to),
+            };
+            Ok(Some(new_ref))
+        } else {
+            Ok(None)
+        }
+    }
+
     /// Write a manifest for a node that was created in this session
     /// It doesn't need to look at previous manifests because the node is new
     async fn write_manifest_for_new_node(
@@ -1645,38 +1657,23 @@ impl<'a> FlushProcess<'a> {
         let splits =
             self.splits.get(node_id).expect("getting split for node unexpectedly failed");
 
-        let iterators = splits
-            .iter()
-            .filter_map(|extent| {
-                if self.change_set.array_manifest(node_id, extent).is_some() {
-                    let chunks = stream::iter(
-                        self.change_set
-                            .new_array_chunk_iterator(node_id, node_path, extent.clone())
-                            .map(Ok),
-                    );
-                    Some(chunks)
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-
-        let new_refs =
-            futures::future::join_all(iterators.into_iter().map(|chunks| async {
+        for extent in splits.iter() {
+            if self.change_set.array_manifest(node_id, extent).is_some() {
+                let chunks = stream::iter(
+                    self.change_set
+                        .new_array_chunk_iterator(node_id, node_path, extent.clone())
+                        .map(Ok),
+                );
                 #[allow(clippy::expect_used)]
-                write_manifest_from_iterator(self.asset_manager.as_ref(), chunks)
-                    .await
-                    .expect(
-                        "logic bug. for a new node, we must always write the manifest",
-                    )
-            }))
-            .await;
-
-        for (new_ref, file_info) in new_refs.into_iter().flatten() {
-            self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref);
-            self.manifest_files.insert(file_info);
+                let new_ref = self.write_manifest_from_iterator(chunks).await.expect(
+                    "logic bug. for a new node, we must always write the manifest",
+                );
+                // new_ref is None if there were no chunks in the iterator
+                if let Some(new_ref) = new_ref {
+                    self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref);
+                }
+            }
         }
-
         Ok(())
     }
 
@@ -1692,6 +1689,9 @@ impl<'a> FlushProcess<'a> {
         #[allow(clippy::expect_used)]
         let splits =
             self.splits.get(&node.id).expect("splits should exist for this node.");
+        let mut refs =
+            HashMap::<ManifestExtents, Vec<ManifestRef>>::with_capacity(splits.len());
+
         let on_disk_extents =
             existing_manifests.iter().map(|m| m.extents.clone()).collect::<Vec<_>>();
 
@@ -1704,23 +1704,12 @@ impl<'a> FlushProcess<'a> {
         // must be a subset of ``splits`` (the splits set in the config)
         debug_assert!(modified_splits.is_subset(&splits.iter().collect::<HashSet<_>>()));
 
-        let mut tasks = Vec::new();
         for extent in splits.iter() {
             if modified_splits.contains(extent) {
-                let updated_chunks = map_ok_second(
-                    updated_node_chunks_iterator(
-                        self.asset_manager.as_ref(),
-                        self.change_set,
-                        self.parent_id,
-                        node.clone(),
-                        extent.clone(),
-                    )
-                    .await,
-                );
-                tasks.push(write_manifest_from_iterator(
-                    self.asset_manager.as_ref(),
-                    updated_chunks,
-                ));
+                // this split was modified in this session, rewrite it completely
+                self.write_manifest_for_updated_chunks(node, extent)
+                    .await?
+                    .map(|new_ref| refs.insert(extent.clone(), vec![new_ref]));
             } else {
                 // intersection of the current split with extents on disk
                 let on_disk_bbox = on_disk_extents
@@ -1737,10 +1726,7 @@ impl<'a> FlushProcess<'a> {
                         Overlap::Complete => {
                             debug_assert!(on_disk_bbox.is_some());
                             // Just propagate this ref again, no rewriting necessary
-                            self.manifest_refs
-                                .entry(node.id.clone())
-                                .or_default()
-                                .push(old_ref.clone());
+                            refs.entry(extent.clone()).or_default().push(old_ref.clone());
                             // OK to unwrap here since this manifest file must exist in the old snapshot
                             #[allow(clippy::expect_used)]
                             self.manifest_files.insert(
@@ -1751,20 +1737,12 @@ impl<'a> FlushProcess<'a> {
                             // the splits have changed, but no refs in this split have been written in this session
                             // same as `if` block above
                             debug_assert!(on_disk_bbox.is_some());
-                            let updated_chunks = map_ok_second(
-                                updated_node_chunks_iterator(
-                                    self.asset_manager.as_ref(),
-                                    self.change_set,
-                                    self.parent_id,
-                                    node.clone(),
-                                    extent.clone(),
-                                )
-                                .await,
-                            );
-                            tasks.push(write_manifest_from_iterator(
-                                self.asset_manager.as_ref(),
-                                updated_chunks,
-                            ));
+                            if let Some(new_ref) = self
+                                .write_manifest_for_updated_chunks(node, extent)
+                                .await?
+                            {
+                                refs.entry(extent.clone()).or_default().push(new_ref);
+                            }
                         }
                         Overlap::None => {
                             // Nothing to do
@@ -1774,15 +1752,12 @@ impl<'a> FlushProcess<'a> {
             }
         }
 
-        let new_refs = futures::future::join_all(tasks.into_iter()).await;
-
         // FIXME: Assert that bboxes in refs don't overlap
 
-        for (new_ref, file_info) in new_refs.into_iter().flatten().flatten() {
-            self.manifest_refs.entry(node.id.clone()).or_default().push(new_ref);
-            self.manifest_files.insert(file_info);
-        }
-
+        self.manifest_refs
+            .entry(node.id.clone())
+            .or_default()
+            .extend(refs.into_values().flatten());
         Ok(())
     }
 

From fcdc2d686834279fe587b40231d80b1891b98c43 Mon Sep 17 00:00:00 2001
From: Deepak Cherian <deepak@earthmover.io>
Date: Mon, 23 Jun 2025 11:59:08 -0600
Subject: [PATCH 43/43] lint

---
 icechunk-python/benchmarks/test_benchmark_writes.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/icechunk-python/benchmarks/test_benchmark_writes.py b/icechunk-python/benchmarks/test_benchmark_writes.py
index ba0046725..efe77e8cc 100644
--- a/icechunk-python/benchmarks/test_benchmark_writes.py
+++ b/icechunk-python/benchmarks/test_benchmark_writes.py
@@ -181,7 +181,9 @@ def write():
 
 
 @pytest.mark.benchmark(group="refs-write")
-def test_write_split_manifest_refs_full_rewrite(benchmark, splitting, large_write_dataset) -> None:
+def test_write_split_manifest_refs_full_rewrite(
+    benchmark, splitting, large_write_dataset
+) -> None:
     dataset = large_write_dataset
     config = repo_config_with(splitting=splitting)
     assert config is not None
@@ -222,7 +224,9 @@ def commit(session_from_setup):
 
 
 @pytest.mark.benchmark(group="refs-write")
-def test_write_split_manifest_refs_append(benchmark, splitting, large_write_dataset) -> None:
+def test_write_split_manifest_refs_append(
+    benchmark, splitting, large_write_dataset
+) -> None:
     dataset = large_write_dataset
     config = repo_config_with(splitting=splitting)
     assert config is not None