From 4b30b9f25aa1349ace708227c28b5e3dc6ec56f9 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 12 May 2025 21:01:03 -0600 Subject: [PATCH 01/43] [WIP] Optimize split manifest writes. --- icechunk/src/change_set.rs | 236 ++++++++-- icechunk/src/conflicts/detector.rs | 1 - icechunk/src/format/manifest.rs | 48 +- icechunk/src/format/transaction_log.rs | 1 - icechunk/src/repository.rs | 26 +- icechunk/src/session.rs | 622 ++++++++++++++++++------- icechunk/src/strategies.rs | 14 +- 7 files changed, 708 insertions(+), 240 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 8a5f880be..d42830b98 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -1,7 +1,6 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, iter, - mem::take, }; use bytes::Bytes; @@ -9,9 +8,10 @@ use itertools::{Either, Itertools as _}; use serde::{Deserialize, Serialize}; use crate::{ + config::ManifestSplittingConfig, format::{ ChunkIndices, NodeId, Path, - manifest::{ChunkInfo, ChunkPayload}, + manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits}, snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot}, }, session::SessionResult, @@ -24,20 +24,89 @@ pub struct ArrayData { pub user_data: Bytes, } +impl ManifestSplits { + pub fn which_extent(&self, coord: &ChunkIndices) -> SessionResult<&ManifestExtents> { + Ok(self.0.get(self.which(coord)?).expect(&format!( + "logic bug, could not find ManifestExtents for this coordinate: {:?}", + coord + ))) + } +} + +#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)] +pub struct SplitManifest { + from: Vec, + to: Vec, + // It's important we keep these sorted, we use this fact in TransactionLog creation + chunks: BTreeMap>, +} + +impl SplitManifest { + pub fn update(&mut self, coord: ChunkIndices, data: Option) { + if self.from.is_empty() { + debug_assert!(self.to.is_empty()); + debug_assert!(self.chunks.is_empty()); + // important to remember that `to` is not inclusive, so we need +1 + let mut coord0 = coord.0.clone(); + self.to.extend(coord0.iter().map(|n| *n + 1)); + self.from.append(&mut coord0); + } else { + for (existing, coord0) in self.from.iter_mut().zip(coord.0.iter()) { + if coord0 < existing { + *existing = *coord0 + } + } + for (existing, coord0) in self.to.iter_mut().zip(coord.0.iter()) { + // important to remember that `to` is not inclusive, so we need +1 + let range_value = coord0 + 1; + if range_value > *existing { + *existing = range_value + } + } + } + self.chunks.insert(coord, data); + } + + pub fn retain(&mut self, predicate: impl Fn(&ChunkIndices) -> bool) { + self.chunks.retain(|coord, _| { + if !predicate(coord) { + // FIXME: handle from, to updating + todo!(); + } else { + false + } + }) + } + + pub fn extents(&self) -> ManifestExtents { + ManifestExtents::new(self.from.as_slice(), self.to.as_slice()) + } +} + #[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] pub struct ChangeSet { + // splitting configuration is recorded at the time the writable session is created + // we ignore any succeeding changes in repository config. + splitting: ManifestSplittingConfig, + // This is an optimization so that we needn't figure out the split sizes on every set. + // TODO: consider merging with `set_chunks` BTreeMap + splits: HashMap, new_groups: HashMap, new_arrays: HashMap, updated_arrays: HashMap, updated_groups: HashMap, - // It's important we keep these sorted, we use this fact in TransactionLog creation - // TODO: could track ManifestExtents - set_chunks: BTreeMap>>, + // FIXME: It's important we keep these sorted, we use this fact in TransactionLog creation + // Change HashMap -> BTreeMap, need to check Ord on ManifestExtents + set_chunks: BTreeMap>, deleted_groups: HashSet<(Path, NodeId)>, deleted_arrays: HashSet<(Path, NodeId)>, } impl ChangeSet { + pub fn new(splitting: ManifestSplittingConfig) -> Self { + Self { splitting, ..Default::default() } + } + pub fn deleted_arrays(&self) -> impl Iterator { self.deleted_arrays.iter() } @@ -58,11 +127,16 @@ impl ChangeSet { self.deleted_arrays.contains(path_and_id) } + pub fn splits(&self, id: &NodeId) -> Option<&ManifestSplits> { + self.splits.get(id) + } + pub fn chunk_changes( &self, - ) -> impl Iterator>)> - { - self.set_chunks.iter() + ) -> impl Iterator)> { + self.set_chunks.iter().map(|(node_id, split_map)| { + (node_id, split_map.values().flat_map(|x| x.chunks.keys())) + }) } pub fn has_chunk_changes(&self, node: &NodeId) -> bool { @@ -70,7 +144,7 @@ impl ChangeSet { } pub fn arrays_with_chunk_changes(&self) -> impl Iterator { - self.chunk_changes().map(|(node, _)| node) + self.set_chunks.iter().map(|(node, _)| node) } pub fn is_empty(&self) -> bool { @@ -100,11 +174,39 @@ impl ChangeSet { } } + fn maybe_update_cached_splits( + &mut self, + node_id: &NodeId, + path: &Path, + shape: &ArrayShape, + dimension_names: &Option>, + ) { + if !self.splits.contains_key(node_id) { + // Q: What happens if we set a chunk, then change a dimension name, so + // that the split changes. + // A: We ignore it. splits are set once for a node in a session, and are never changed. + let splits = self.splitting.get_split_sizes(path, shape, dimension_names); + self.splits.insert(node_id.clone(), splits); + } + } + pub fn add_array(&mut self, path: Path, node_id: NodeId, array_data: ArrayData) { + self.maybe_update_cached_splits( + &node_id, + &path, + &array_data.shape, + &array_data.dimension_names, + ); self.new_arrays.insert(path, (node_id, array_data)); } pub fn update_array(&mut self, node_id: &NodeId, path: &Path, array_data: ArrayData) { + self.maybe_update_cached_splits( + &node_id, + &path, + &array_data.shape, + &array_data.dimension_names, + ); match self.new_arrays.get(path) { Some((id, _)) => { debug_assert!(!self.updated_arrays.contains_key(id)); @@ -139,6 +241,7 @@ impl ChangeSet { self.updated_arrays.remove(node_id); self.set_chunks.remove(node_id); + self.splits.remove(node_id); if !is_new_array { self.deleted_arrays.insert((path, node_id.clone())); } @@ -167,14 +270,30 @@ impl ChangeSet { coord: ChunkIndices, data: Option, ) { + let cached_splits = self.splits.get(&node_id).expect(&format!( + "logic bug. change_set.splits should be populated for node {}", + node_id + )); + + let extent = cached_splits.which_extent(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); // this implementation makes delete idempotent // it allows deleting a deleted chunk by repeatedly setting None. self.set_chunks .entry(node_id) .and_modify(|h| { - h.insert(coord.clone(), data.clone()); + h.entry(extent.clone()).or_default().update(coord.clone(), data.clone()); }) - .or_insert(BTreeMap::from([(coord, data)])); + .or_insert_with(|| { + let mut h = HashMap::::with_capacity( + cached_splits.len(), + ); + h.entry(extent.clone()) + // TODO: this is duplicative. I can't use `or_default` because it's + // nice to create the HashMap using `with_capacity` + .or_default() + .update(coord, data); + h + }); } pub fn get_chunk_ref( @@ -182,7 +301,17 @@ impl ChangeSet { node_id: &NodeId, coords: &ChunkIndices, ) -> Option<&Option> { - self.set_chunks.get(node_id).and_then(|h| h.get(coords)) + self.splits + .get(node_id) + .and_then(|splits| { + splits.which_extent(coords).ok().map(|extent| { + self.set_chunks + .get(node_id) + .and_then(|h| h.get(&extent)) + .and_then(|s| s.chunks.get(coords)) + }) + }) + .flatten() } /// Drop the updated chunk references for the node. @@ -190,10 +319,12 @@ impl ChangeSet { pub fn drop_chunk_changes( &mut self, node_id: &NodeId, - predicate: impl Fn(&ChunkIndices) -> bool, + predicate: impl Fn(&ChunkIndices) -> bool + Copy, ) { if let Some(changes) = self.set_chunks.get_mut(node_id) { - changes.retain(|coord, _| !predicate(coord)); + for split in changes.values_mut() { + split.retain(predicate); + } } } @@ -201,13 +332,21 @@ impl ChangeSet { &self, node_id: &NodeId, node_path: &Path, + extent: Option, ) -> impl Iterator)> + use<'_> { if self.is_deleted(node_path, node_id) { return Either::Left(iter::empty()); } match self.set_chunks.get(node_id) { None => Either::Left(iter::empty()), - Some(h) => Either::Right(h.iter()), + Some(h) => Either::Right( + h.iter() + // FIXME: review this + .filter(move |(manifest_extent, _)| { + extent.is_none() || Some(*manifest_extent) == extent.as_ref() + }) + .flat_map(|(_, manifest)| manifest.chunks.iter()), + ), } } @@ -224,7 +363,7 @@ impl ChangeSet { node_id: &'a NodeId, node_path: &Path, ) -> impl Iterator + use<'a> { - self.array_chunks_iterator(node_id, node_path).filter_map( + self.array_chunks_iterator(node_id, node_path, None).filter_map( move |(coords, payload)| { payload.as_ref().map(|p| ChunkInfo { node: node_id.clone(), @@ -235,6 +374,47 @@ impl ChangeSet { ) } + pub fn array_manifests_iterator( + &self, + node_id: &NodeId, + node_path: &Path, + ) -> impl Iterator + use<'_> { + if self.is_deleted(node_path, node_id) { + return Either::Left(iter::empty()); + } + match self.set_chunks.get(node_id) { + None => Either::Left(iter::empty()), + Some(h) => Either::Right(h.iter()), + } + } + + pub fn array_manifest( + &self, + node_id: &NodeId, + extent: &ManifestExtents, + ) -> Option<&SplitManifest> { + self.set_chunks.get(node_id).and_then(|x| x.get(extent)) + } + + /// Iterator over chunks for a new array for a given ManifestExtents + pub fn new_array_manifest_chunks_iterator<'a>( + &'a self, + node_id: &'a NodeId, + extent: &ManifestExtents, + ) -> impl Iterator + use<'a> { + if let Some(manifest) = self.array_manifest(node_id, extent) { + Either::Right(manifest.chunks.iter().filter_map(move |(coords, payload)| { + payload.as_ref().map(|p| ChunkInfo { + node: node_id.clone(), + coord: coords.clone(), + payload: p.clone(), + }) + })) + } else { + Either::Left(iter::empty()) + } + } + pub fn new_nodes(&self) -> impl Iterator { self.new_groups().chain(self.new_arrays()) } @@ -247,24 +427,25 @@ impl ChangeSet { self.new_arrays.iter().map(|(path, (node_id, _))| (path, node_id)) } - pub fn take_chunks( - &mut self, - ) -> BTreeMap>> { - take(&mut self.set_chunks) - } + // pub fn take_chunks( + // &mut self, + // ) -> BTreeMap>> { + // take(&mut self.set_chunks) + // } - pub fn set_chunks( - &mut self, - chunks: BTreeMap>>, - ) { - self.set_chunks = chunks - } + // pub fn set_chunks( + // &mut self, + // chunks: BTreeMap>>, + // ) { + // self.set_chunks = chunks + // } /// Merge this ChangeSet with `other`. /// /// Results of the merge are applied to `self`. Changes present in `other` take precedence over /// `self` changes. pub fn merge(&mut self, other: ChangeSet) { + // FIXME: what do I do with splitting and splits here. // FIXME: this should detect conflict, for example, if different writers added on the same // path, different objects, or if the same path is added and deleted, etc. // TODO: optimize @@ -274,6 +455,7 @@ impl ChangeSet { self.updated_arrays.extend(other.updated_arrays); self.deleted_groups.extend(other.deleted_groups); self.deleted_arrays.extend(other.deleted_arrays); + // FIXME: handle splits for (node, other_chunks) in other.set_chunks.into_iter() { match self.set_chunks.remove(&node) { diff --git a/icechunk/src/conflicts/detector.rs b/icechunk/src/conflicts/detector.rs index 3de442e62..d2c4cef24 100644 --- a/icechunk/src/conflicts/detector.rs +++ b/icechunk/src/conflicts/detector.rs @@ -145,7 +145,6 @@ impl ConflictSolver for ConflictDetector { None } else { let conflicting: HashSet<_> = changes - .keys() .filter(|coord| previous_changes.contains(coord)) .cloned() .collect(); diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index ea03d32d3..f9e89009d 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -1,10 +1,16 @@ -use std::{borrow::Cow, convert::Infallible, ops::Range, sync::Arc}; +use std::{ + borrow::Cow, + cmp::{max, min}, + convert::Infallible, + ops::Range, + sync::Arc, +}; use crate::format::flatbuffers::generated; use bytes::Bytes; use flatbuffers::VerifierOptions; use futures::{Stream, TryStreamExt}; -use itertools::{Itertools, multiunzip}; +use itertools::{Itertools, any, multiunzip}; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -18,15 +24,9 @@ use super::{ ChunkId, ChunkIndices, ChunkLength, ChunkOffset, IcechunkResult, ManifestId, NodeId, }; -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize)] pub struct ManifestExtents(Vec>); -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ManifestRef { - pub object_id: ManifestId, - pub extents: ManifestExtents, -} - impl ManifestExtents { pub fn new(from: &[u32], to: &[u32]) -> Self { let v = from @@ -37,6 +37,10 @@ impl ManifestExtents { Self(v) } + pub fn from_ranges_iter(ranges: impl IntoIterator>) -> Self { + Self(ranges.into_iter().collect()) + } + pub fn contains(&self, coord: &[u32]) -> bool { self.iter().zip(coord.iter()).all(|(range, that)| range.contains(that)) } @@ -52,10 +56,32 @@ impl ManifestExtents { pub fn is_empty(&self) -> bool { self.0.is_empty() } + + pub fn intersection(&self, other: &Self) -> Option { + debug_assert_eq!(self.len(), other.len()); + let ranges = std::iter::zip(self.iter(), other.iter()) + .map(|(a, b)| max(a.start, b.start)..min(a.end, b.end)) + .collect::>(); + if any(ranges.iter(), |r| r.end < r.start) { None } else { Some(Self(ranges)) } + } + + pub fn union(&self, other: &Self) -> Self { + debug_assert_eq!(self.len(), other.len()); + Self::from_ranges_iter( + std::iter::zip(self.iter(), other.iter()) + .map(|(a, b)| min(a.start, b.start)..max(a.end, b.end)), + ) + } } -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct ManifestSplits(Vec); +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ManifestRef { + pub object_id: ManifestId, + pub extents: ManifestExtents, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ManifestSplits(pub Vec); impl ManifestSplits { /// Used at read-time diff --git a/icechunk/src/format/transaction_log.rs b/icechunk/src/format/transaction_log.rs index ee064b66f..06324b833 100644 --- a/icechunk/src/format/transaction_log.rs +++ b/icechunk/src/format/transaction_log.rs @@ -47,7 +47,6 @@ impl TransactionLog { let node_id = generated::ObjectId8::new(&node_id.0); let node_id = Some(&node_id); let chunks = chunks - .keys() .map(|indices| { let coords = Some(builder.create_vector(indices.0.as_slice())); generated::ChunkIndices::create( diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index d7652a815..54668c50c 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1380,24 +1380,15 @@ mod tests { ), ]; let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) }; - let repo = create_repo_with_split_manifest_config( - &temp_path, - &shape, - &dimension_names, - &split_config, - None, - ) - .await?; - let session = repo.writable_session("main").await?; - let actual = - split_config.get_split_sizes(&session.get_node(&temp_path).await?)?; let expected = ManifestSplits::from_edges(vec![ vec![0, 12, 24, 25], vec![0, 9, 10], vec![0, 2, 3], vec![0, 4], ]); + + let actual = split_config.get_split_sizes(&temp_path, &shape, &dimension_names); assert_eq!(actual, expected); let split_sizes = vec![( @@ -1418,18 +1409,7 @@ mod tests { ], )]; let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) }; - let repo = create_repo_with_split_manifest_config( - &temp_path, - &shape, - &dimension_names, - &split_config, - None, - ) - .await?; - - let session = repo.writable_session("main").await?; - let actual = - split_config.get_split_sizes(&session.get_node(&temp_path).await?)?; + let actual = split_config.get_split_sizes(&temp_path, &shape, &dimension_names); assert_eq!(actual, expected); Ok(()) diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 8a4c257b2..8175242a4 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -3,6 +3,7 @@ use std::{ collections::{HashMap, HashSet}, convert::Infallible, future::{Future, ready}, + iter::zip, ops::Range, pin::Pin, sync::Arc, @@ -188,6 +189,40 @@ impl ManifestSplits { } } +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Overlap { + Complete, + Partial, + None, +} + +/// Important: this is not symmetric. +pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap { + debug_assert!(us.len() == them.len()); + + let mut overlaps = vec![]; + for (a, b) in zip(us.iter(), them.iter()) { + debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone()); + debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone()); + + if (a.start <= b.start) && (a.end >= b.end) { + overlaps.push(Overlap::Complete); + } else if (a.end <= b.start) || (a.start >= b.end) { + overlaps.push(Overlap::None); + } else { + overlaps.push(Overlap::Partial) + } + } + + if overlaps.iter().all(|x| x == &Overlap::Complete) { + return Overlap::Complete; + } else if overlaps.iter().any(|x| x == &Overlap::None) { + return Overlap::None; + } else { + return Overlap::Partial; + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Session { config: RepositoryConfig, @@ -234,6 +269,7 @@ impl Session { snapshot_id: SnapshotId, default_commit_metadata: SnapshotProperties, ) -> Self { + let splitting = config.manifest().splitting().clone(); Self { config, storage_settings: Arc::new(storage_settings), @@ -242,7 +278,7 @@ impl Session { virtual_resolver, branch_name: Some(branch_name), snapshot_id, - change_set: ChangeSet::default(), + change_set: ChangeSet::new(splitting), default_commit_metadata, } } @@ -560,6 +596,7 @@ impl Session { &self.change_set, &self.snapshot_id, path, + None, ) .await } @@ -800,6 +837,7 @@ impl Session { &self.change_set, &self.snapshot_id, node.clone(), + None, ) .await .map_ok(|(_path, chunk_info)| chunk_info.coord); @@ -1112,8 +1150,14 @@ async fn updated_chunk_iterator<'a>( let snapshot = asset_manager.fetch_snapshot(snapshot_id).await?; let nodes = futures::stream::iter(snapshot.iter_arc()); let res = nodes.and_then(move |node| async move { - Ok(updated_node_chunks_iterator(asset_manager, change_set, snapshot_id, node) - .await) + Ok(updated_node_chunks_iterator( + asset_manager, + change_set, + snapshot_id, + node, + None, + ) + .await) }); Ok(res.try_flatten()) } @@ -1123,6 +1167,7 @@ async fn updated_node_chunks_iterator<'a>( change_set: &'a ChangeSet, snapshot_id: &'a SnapshotId, node: NodeSnapshot, + extent: Option, ) -> impl Stream> + 'a { // This iterator should yield chunks for existing arrays + any updates. // we check for deletion here in the case that `path` exists in the snapshot, @@ -1133,9 +1178,15 @@ async fn updated_node_chunks_iterator<'a>( let path = node.path.clone(); Either::Right( // TODO: avoid clone - verified_node_chunk_iterator(asset_manager, snapshot_id, change_set, node) - .await - .map_ok(move |ci| (path.clone(), ci)), + verified_node_chunk_iterator( + asset_manager, + snapshot_id, + change_set, + node, + extent, + ) + .await + .map_ok(move |ci| (path.clone(), ci)), ) } } @@ -1146,11 +1197,18 @@ async fn node_chunk_iterator<'a>( change_set: &'a ChangeSet, snapshot_id: &'a SnapshotId, path: &Path, + extent: Option, ) -> impl Stream> + 'a + use<'a> { match get_node(asset_manager, change_set, snapshot_id, path).await { Ok(node) => futures::future::Either::Left( - verified_node_chunk_iterator(asset_manager, snapshot_id, change_set, node) - .await, + verified_node_chunk_iterator( + asset_manager, + snapshot_id, + change_set, + node, + extent, + ) + .await, ), Err(_) => futures::future::Either::Right(futures::stream::empty()), } @@ -1162,20 +1220,22 @@ async fn verified_node_chunk_iterator<'a>( snapshot_id: &'a SnapshotId, change_set: &'a ChangeSet, node: NodeSnapshot, + extent: Option, ) -> impl Stream> + 'a { match node.node_data { NodeData::Group => futures::future::Either::Left(futures::stream::empty()), NodeData::Array { manifests, .. } => { let new_chunk_indices: Box> = Box::new( change_set - .array_chunks_iterator(&node.id, &node.path) + .array_chunks_iterator(&node.id, &node.path, extent.clone()) .map(|(idx, _)| idx) .collect(), ); let node_id_c = node.id.clone(); + let extent_c = extent.clone(); let new_chunks = change_set - .array_chunks_iterator(&node.id, &node.path) + .array_chunks_iterator(&node.id, &node.path, extent.clone()) .filter_map(move |(idx, payload)| { payload.as_ref().map(|payload| { Ok(ChunkInfo { @@ -1189,11 +1249,17 @@ async fn verified_node_chunk_iterator<'a>( futures::future::Either::Right( futures::stream::iter(new_chunks).chain( futures::stream::iter(manifests) + .filter(move |manifest_ref| { + futures::future::ready(extent.as_ref().is_none_or(|e| { + overlaps(&manifest_ref.extents, &e) != Overlap::None + })) + }) .then(move |manifest_ref| { let new_chunk_indices = new_chunk_indices.clone(); let node_id_c = node.id.clone(); let node_id_c2 = node.id.clone(); let node_id_c3 = node.id.clone(); + let extent_c2 = extent_c.clone(); async move { let manifest = fetch_manifest( &manifest_ref.object_id, @@ -1207,6 +1273,13 @@ async fn verified_node_chunk_iterator<'a>( .iter(node_id_c.clone()) .filter_ok(move |(coord, _)| { !new_chunk_indices.contains(coord) + // If the manifest we are parsing partially overlaps with `extent`, + // we need to filter all coordinates + && extent_c2.as_ref().is_none_or( + move |e| { + e.contains(coord.0.as_slice()) + }, + ) }) .map_ok(move |(coord, payload)| ChunkInfo { node: node_id_c2.clone(), @@ -1214,6 +1287,7 @@ async fn verified_node_chunk_iterator<'a>( payload, }); + // FIXME: I don't understand this let old_chunks = change_set .update_existing_chunks( node_id_c3, old_chunks, @@ -1444,41 +1518,6 @@ pub fn construct_valid_byte_range( } } -#[derive(Default, Debug)] -struct SplitManifest { - from: Vec, - to: Vec, - chunks: Vec, -} - -impl SplitManifest { - fn update(&mut self, chunk: ChunkInfo) { - if self.from.is_empty() { - debug_assert!(self.to.is_empty()); - debug_assert!(self.chunks.is_empty()); - // important to remember that `to` is not inclusive, so we need +1 - let mut coord = chunk.coord.0.clone(); - self.to.extend(coord.iter().map(|n| *n + 1)); - self.from.append(&mut coord); - } else { - for (existing, coord) in self.from.iter_mut().zip(chunk.coord.0.iter()) { - if coord < existing { - *existing = *coord - } - } - for (existing, coord) in self.to.iter_mut().zip(chunk.coord.0.iter()) { - // important to remember that `to` is not inclusive, so we need +1 - let range_value = coord + 1; - if range_value > *existing { - *existing = range_value - } - } - } - - self.chunks.push(chunk) - } -} - struct FlushProcess<'a> { asset_manager: Arc, change_set: &'a ChangeSet, @@ -1505,69 +1544,91 @@ impl<'a> FlushProcess<'a> { } } - async fn write_manifests_from_iterator( + async fn write_manifest_for_updated_chunks( &mut self, - node_id: &NodeId, - chunks: impl Stream>, - splits: ManifestSplits, - ) -> SessionResult<()> { - // TODO: think about optimizing writes to manifests - // TODO: add benchmarks - let split_refs = chunks - .try_fold( - // TODO: have the changeset track this HashMap - HashMap::::with_capacity(splits.len()), - |mut split_refs, chunk| async { - let split_index = splits.which(&chunk.coord); - split_index.map(|index| { - split_refs.entry(index).or_default().update(chunk); - split_refs - }) - }, - ) - .await?; - - for (_, shard) in split_refs.into_iter() { - let shard_chunks = - stream::iter(shard.chunks.into_iter().map(Ok::)); - - if let Some(new_manifest) = Manifest::from_stream(shard_chunks).await.unwrap() - { - let new_manifest = Arc::new(new_manifest); - let new_manifest_size = - self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?; - - let file_info = - ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); - self.manifest_files.insert(file_info); - - let new_ref = ManifestRef { - object_id: new_manifest.id().clone(), - extents: ManifestExtents::new(&shard.from, &shard.to), - }; + node: &NodeSnapshot, + extent: &ManifestExtents, + actual_extents: ManifestExtents, + ) -> SessionResult> { + let asset_manager = Arc::clone(&self.asset_manager); + let updated_chunks = updated_node_chunks_iterator( + asset_manager.as_ref(), + self.change_set, + self.parent_id, + node.clone(), + Some(extent.clone()), + ) + .await + .map_ok(|(_path, chunk_info)| chunk_info); + self.write_manifest_from_iterator(updated_chunks, actual_extents).await + } - self.manifest_refs - .entry(node_id.clone()) - .and_modify(|v| v.push(new_ref.clone())) - .or_insert_with(|| vec![new_ref]); - } + async fn write_manifest_from_iterator( + &mut self, + chunks: impl Stream>, + actual_extents: ManifestExtents, + ) -> SessionResult> { + if let Some(new_manifest) = Manifest::from_stream(chunks).await.unwrap() { + let new_manifest = Arc::new(new_manifest); + let new_manifest_size = + self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?; + + let file_info = + ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); + self.manifest_files.insert(file_info); + + let new_ref = ManifestRef { + object_id: new_manifest.id().clone(), + extents: actual_extents, + }; + Ok(Some(new_ref)) + } else { + Ok(None) } + } + fn finalize_refs( + &mut self, + node_id: &NodeId, + refs: HashMap, + ) -> SessionResult<()> { + for ref_ in refs.into_values() { + self.manifest_refs + .entry(node_id.clone()) + .and_modify(|v| v.push(ref_.clone())) + .or_insert_with(|| vec![ref_]); + } Ok(()) } - /// Write a manifest for a node that was created in this session /// It doesn't need to look at previous manifests because the node is new async fn write_manifest_for_new_node( &mut self, node_id: &NodeId, node_path: &Path, - splits: ManifestSplits, ) -> SessionResult<()> { - let chunks = stream::iter( - self.change_set.new_array_chunk_iterator(node_id, node_path).map(Ok), - ); - self.write_manifests_from_iterator(node_id, chunks, splits).await + let splits = self.change_set.splits(node_id).expect(&format!("logic bug, array at {} was added in this changeset, splits should be populated.", node_path)); + let mut refs = + HashMap::::with_capacity(splits.len()); + + // TODO: this could be try_fold with the refs HashMap as state + for extent in splits.iter() { + let cs_extents = self + .change_set + .array_manifest(node_id, extent) + .expect("logic bug. there should be a manifest for this extent ") + .extents(); + + let chunks = stream::iter( + self.change_set + .new_array_manifest_chunks_iterator(node_id, extent) + .map(Ok), + ); + self.write_manifest_from_iterator(chunks, cs_extents) + .await? + .map(|new_ref| refs.insert(extent.clone(), new_ref)); + } + self.finalize_refs(node_id, refs) } /// Write a manifest for a node that was modified in this session @@ -1577,18 +1638,82 @@ impl<'a> FlushProcess<'a> { &mut self, node: &NodeSnapshot, splits: ManifestSplits, + manifests: Vec, ) -> SessionResult<()> { - let asset_manager = Arc::clone(&self.asset_manager); - let updated_chunks = updated_node_chunks_iterator( - asset_manager.as_ref(), - self.change_set, - self.parent_id, - node.clone(), - ) - .await - .map_ok(|(_path, chunk_info)| chunk_info); + // populate with existing refs, if they are compatiblae + let mut refs = + HashMap::::with_capacity(splits.len()); + + let on_disk_extents = + manifests.iter().map(|m| m.extents.clone()).collect::>(); + + let modified_splits = self + .change_set + .array_manifests_iterator(&node.id, &node.path) + .map(|(extents, _)| extents) + .collect::>(); + + // FIXME: there is an invariant here + // ``modified_splits`` (i.e. splits used in this session) + // must be a subset of ``splits`` (the splits set in the config) + + // TODO: this should be try_fold with the refs HashMap as state + for extent in splits.iter() { + let on_disk_bbox = on_disk_extents + .iter() + .filter_map(|e| e.intersection(extent)) + .reduce(|a, b| a.union(&b)); + + if modified_splits.contains(extent) { + // this split was modified in this session, rewrite it completely + let cs_extents = self + .change_set + .array_manifest(&node.id, extent) + .expect("logic bug. there should be a manifest for this extent ") + .extents(); + let actual_extents = + // if there are splits on disk that overlap, then we take that Extents + // and union it with the Extents of the chunks the changeset + on_disk_bbox.map(|x| cs_extents.union(&x)) + // if no overlap, then just use the changeset Extents + .unwrap_or(cs_extents); + self.write_manifest_for_updated_chunks(&node, extent, actual_extents) + .await? + .map(|new_ref| refs.insert(extent.clone(), new_ref)); + } else { + // split was unmodified in this session. Let's look at the current manifests + // and see what we need to do with them + for old_ref in manifests.iter() { + // Remember that the extents written to disk are the `from`:`to` ranges + // of populated chunks + match overlaps(&old_ref.extents, extent) { + Overlap::Complete => { + debug_assert!(on_disk_bbox.is_some()); + // Just propagate this ref again, no rewriting necessary + refs.insert(extent.clone(), old_ref.clone()); + } + Overlap::Partial => { + // the splits have changed, but no refs in this split have been written in this session + // same as `if` block above + debug_assert!(on_disk_bbox.is_some()); + self.write_manifest_for_updated_chunks( + &node, + extent, + on_disk_bbox.clone().expect("logic bug in writing manifests from disk for partially overlapping split"), + ) + .await? + .map(|new_ref| refs.insert(extent.clone(), new_ref)); + } + Overlap::None => { + // Nothing to do + } + }; + } + } + } - self.write_manifests_from_iterator(&node.id, updated_chunks, splits).await + self.finalize_refs(&node.id, refs)?; + Ok(()) } /// Record the previous manifests for an array that was not modified in the session @@ -1636,69 +1761,65 @@ impl ManifestSplitDimCondition { } impl ManifestSplittingConfig { - pub fn get_split_sizes(&self, node: &NodeSnapshot) -> SessionResult { - match &node.node_data { - NodeData::Group => Err(SessionErrorKind::NotAnArray { - node: node.clone(), - message: "attempting to split manifest for group".to_string(), - } - .into()), - NodeData::Array { shape, dimension_names, .. } => { - let ndim = shape.len(); - let num_chunks = shape.num_chunks().collect::>(); - let mut edges: Vec> = - (0..ndim).map(|axis| vec![0, num_chunks[axis]]).collect(); - - // This is ugly but necessary to handle: - // - path: * - // manifest-split-size: - // - t : 10 - // - path: * - // manifest-split-size: - // - y : 2 - // which is now identical to: - // - path: * - // manifest-split-size: - // - t : 10 - // - y : 2 - let mut already_matched: HashSet = HashSet::new(); - - #[allow(clippy::expect_used)] - let split_sizes = self - .split_sizes + pub fn get_split_sizes( + &self, + path: &Path, + shape: &ArrayShape, + dimension_names: &Option>, + ) -> ManifestSplits { + let ndim = shape.len(); + let num_chunks = shape.num_chunks().collect::>(); + let mut edges: Vec> = + (0..ndim).map(|axis| vec![0, num_chunks[axis]]).collect(); + + // This is ugly but necessary to handle: + // - path: * + // manifest-split-size: + // - t : 10 + // - path: * + // manifest-split-size: + // - y : 2 + // which is now identical to: + // - path: * + // manifest-split-size: + // - t : 10 + // - y : 2 + let mut already_matched: HashSet = HashSet::new(); + + #[allow(clippy::expect_used)] + let split_sizes = self + .split_sizes + .clone() + .or_else(|| Self::default().split_sizes) + .expect("logic bug in grabbing split sizes from ManifestSplittingConfig"); + + for (condition, dim_specs) in split_sizes.iter() { + if condition.matches(path) { + let dimension_names = dimension_names .clone() - .or_else(|| Self::default().split_sizes) - .expect("logic bug"); - - for (condition, dim_specs) in split_sizes.iter() { - if condition.matches(&node.path) { - let dimension_names = dimension_names.clone().unwrap_or( - repeat_n(DimensionName::NotSpecified, ndim).collect(), - ); - for (axis, dimname) in itertools::enumerate(dimension_names) { - if already_matched.contains(&axis) { - continue; - } - for ManifestSplitDim { - condition: dim_condition, - num_chunks: split_size, - } in dim_specs.iter() - { - if dim_condition.matches(axis, dimname.clone().into()) { - edges[axis] = uniform_manifest_split_edges( - num_chunks[axis], - split_size, - ); - already_matched.insert(axis); - break; - }; - } - } + .unwrap_or(repeat_n(DimensionName::NotSpecified, ndim).collect()); + for (axis, dimname) in itertools::enumerate(dimension_names) { + if already_matched.contains(&axis) { + continue; + } + for ManifestSplitDim { + condition: dim_condition, + num_chunks: split_size, + } in dim_specs.iter() + { + if dim_condition.matches(axis, dimname.clone().into()) { + edges[axis] = uniform_manifest_split_edges( + num_chunks[axis], + split_size, + ); + already_matched.insert(axis); + break; + }; } } - Ok(ManifestSplits::from_edges(edges)) } } + ManifestSplits::from_edges(edges) } } @@ -1739,8 +1860,18 @@ async fn flush( &node.path, ) .await?; - let splits = splitting_config.get_split_sizes(&new_node)?; - flush_data.write_manifest_for_existing_node(&node, splits).await?; + if let NodeData::Array { shape, dimension_names, manifests } = + new_node.node_data + { + let splits = splitting_config.get_split_sizes( + &new_node.path, + &shape, + &dimension_names, + ); + flush_data + .write_manifest_for_existing_node(&node, splits, manifests) + .await?; + } } else { trace!(path=%node.path, "Node has no changes, keeping the previous manifest"); // Array wasn't deleted but has no changes in this session @@ -1754,15 +1885,7 @@ async fn flush( for (node_path, node_id) in flush_data.change_set.new_arrays() { trace!(path=%node_path, "New node, writing a manifest"); - let node = get_node( - &flush_data.asset_manager, - flush_data.change_set, - flush_data.parent_id, - node_path, - ) - .await?; - let splits = splitting_config.get_split_sizes(&node)?; - flush_data.write_manifest_for_new_node(node_id, node_path, splits).await?; + flush_data.write_manifest_for_new_node(node_id, node_path).await?; } trace!("Building new snapshot"); @@ -1942,13 +2065,17 @@ mod tests { refs::{Ref, fetch_tag}, repository::VersionInfo, storage::new_in_memory_storage, - strategies::{ShapeDim, empty_writable_session, node_paths, shapes_and_dims}, + strategies::{ + ShapeDim, empty_writable_session, manifest_extents, node_paths, + shapes_and_dims, + }, }; use super::*; use icechunk_macros::tokio_test; - use itertools::Itertools; + use itertools::{Itertools, all, multizip}; use pretty_assertions::assert_eq; + use proptest::collection::vec; use proptest::prelude::{prop_assert, prop_assert_eq}; use storage::logging::LoggingStorage; use test_strategy::proptest; @@ -2108,6 +2235,149 @@ mod tests { prop_assert!(session.delete_group(path.clone()).await.is_ok()); } + #[proptest] + fn test_property_extents_set_ops_same( + #[strategy(manifest_extents(4))] e: ManifestExtents, + ) { + prop_assert_eq!(e.intersection(&e), Some(e.clone())); + prop_assert_eq!(e.union(&e), e.clone()); + prop_assert_eq!(overlaps(&e, &e), Overlap::Complete); + } + + #[proptest] + fn test_property_extents_set_ops( + #[strategy(manifest_extents(4))] e1: ManifestExtents, + #[strategy(manifest_extents(4))] e2: ManifestExtents, + ) { + let union = e1.union(&e2); + let intersection = e1.intersection(&e2); + + prop_assert_eq!(e1.intersection(&union), Some(e1.clone())); + prop_assert_eq!(union.intersection(&e1), Some(e1.clone())); + prop_assert_eq!(e2.intersection(&union), Some(e2.clone())); + prop_assert_eq!(union.intersection(&e2), Some(e2.clone())); + + // order is important for the next 2 + prop_assert_eq!(overlaps(&union, &e1), Overlap::Complete); + prop_assert_eq!(overlaps(&union, &e2), Overlap::Complete); + + if intersection.is_some() { + let int = intersection.unwrap(); + let expected = if e1 == e1 { Overlap::Complete } else { Overlap::Partial }; + prop_assert_eq!(overlaps(&e1, &int), expected.clone()); + prop_assert_eq!(overlaps(&e2, &int), expected); + } else { + prop_assert_eq!(overlaps(&e1, &e2), Overlap::None); + prop_assert_eq!(overlaps(&e2, &e1), Overlap::None); + } + } + + #[proptest] + fn test_property_extents_widths( + #[strategy(manifest_extents(4))] extent1: ManifestExtents, + #[strategy(vec(0..100, 4))] delta_left: Vec, + #[strategy(vec(0..100, 4))] delta_right: Vec, + ) { + let widths = extent1.iter().map(|r| (r.end - r.start) as i32).collect::>(); + let extent2 = ManifestExtents::from_ranges_iter( + multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map( + |(extent, dleft, dright)| { + ((extent.start as i32 + dleft) as u32) + ..((extent.end as i32 + dright) as u32) + }, + ), + ); + + if all(delta_left.iter(), |elem| elem == &0i32) + && all(delta_right.iter(), |elem| elem == &0i32) + { + prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Complete); + } + + let extent2 = ManifestExtents::from_ranges_iter( + multizip(( + extent1.iter(), + widths.iter(), + delta_left.iter(), + delta_right.iter(), + )) + .map(|(extent, width, dleft, dright)| { + let (low, high) = (dleft.min(dright), dleft.max(dright)); + ((extent.start as i32 + width + low) as u32) + ..((extent.end as i32 + width + high) as u32) + }), + ); + + prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None); + + let extent2 = ManifestExtents::from_ranges_iter( + multizip(( + extent1.iter(), + widths.iter(), + delta_left.iter(), + delta_right.iter(), + )) + .map(|(extent, width, dleft, dright)| { + let (low, high) = (dleft.min(dright), dleft.max(dright)); + ((extent.start as i32 - width - high).max(0i32) as u32) + ..((extent.end as i32 - width - low) as u32) + }), + ); + prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None); + + let extent2 = ManifestExtents::from_ranges_iter( + multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map( + |(extent, dleft, dright)| { + ((extent.start as i32 - dleft - 1).max(0i32) as u32) + ..((extent.end as i32 + dright + 1) as u32) + }, + ), + ); + prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Partial); + } + + #[icechunk_macros::test] + fn test_overlaps() -> Result<(), Box> { + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![2u32, 4, 6].as_slice(), + ); + + let e2 = ManifestExtents::new( + vec![10u32, 1, 2].as_slice(), + vec![12u32, 4, 6].as_slice(), + ); + + let union = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![12u32, 4, 6].as_slice(), + ); + + assert_eq!(overlaps(&e1, &e2), Overlap::None); + assert_eq!(e1.intersection(&e2), None); + assert_eq!(e1.union(&e2), union); + + + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![2u32, 4, 6].as_slice(), + ); + let e2 = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![42u32, 4, 6].as_slice(), + ); + assert_eq!(overlaps(&e1, &e2), Overlap::None); + assert_eq!(overlaps(&e2, &e1), Overlap::None); + + // this should create non-overlapping extents + let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20], vec![0, 1, 2], vec![0, 21, 22]]); + for vec in splits.iter().combinations(2) { + assert_eq!(overlaps(vec[0], vec[1]), Overlap::None) + } + + Ok(()) + } + #[tokio::test] async fn test_which_split() -> Result<(), Box> { let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20]]); diff --git a/icechunk/src/strategies.rs b/icechunk/src/strategies.rs index 2b3a20310..350a7f57f 100644 --- a/icechunk/src/strategies.rs +++ b/icechunk/src/strategies.rs @@ -7,11 +7,14 @@ use proptest::prelude::*; use proptest::{collection::vec, option, strategy::Strategy}; use crate::Repository; +use crate::format::manifest::ManifestExtents; use crate::format::snapshot::{ArrayShape, DimensionName}; use crate::format::{ChunkIndices, Path}; use crate::session::Session; use crate::storage::new_in_memory_storage; +const MAX_NDIM: usize = 4; + pub fn node_paths() -> impl Strategy { // FIXME: Add valid paths #[allow(clippy::expect_used)] @@ -66,7 +69,7 @@ pub struct ShapeDim { pub fn shapes_and_dims(max_ndim: Option) -> impl Strategy { // FIXME: ndim = 0 - let max_ndim = max_ndim.unwrap_or(4usize); + let max_ndim = max_ndim.unwrap_or(MAX_NDIM); vec(1u64..26u64, 1..max_ndim) .prop_flat_map(|shape| { let ndim = shape.len(); @@ -97,6 +100,15 @@ pub fn shapes_and_dims(max_ndim: Option) -> impl Strategy impl Strategy { + (vec(0u32..1000u32, ndim), vec(0u32..1000u32, ndim)).prop_map(|(start, delta)| { + let stop = std::iter::zip(start.iter(), delta.iter()) + .map(|(s, d)| s + d) + .collect::>(); + ManifestExtents::new(start.as_slice(), stop.as_slice()) + }) +} + //prop_compose! { // pub fn zarr_array_metadata()( // chunk_key_encoding: ChunkKeyEncoding, From 3a19559fbfa91c8ec9c8c4b75a7ab8aa040b8b01 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 6 Jun 2025 13:11:14 -0600 Subject: [PATCH 02/43] Refactor to track splitting on Session --- icechunk/src/change_set.rs | 99 ++++++---------------- icechunk/src/session.rs | 168 +++++++++++++++++++++++++------------ 2 files changed, 141 insertions(+), 126 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index d42830b98..e48aec6e0 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -8,13 +8,12 @@ use itertools::{Either, Itertools as _}; use serde::{Deserialize, Serialize}; use crate::{ - config::ManifestSplittingConfig, format::{ ChunkIndices, NodeId, Path, manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits}, snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot}, }, - session::SessionResult, + session::{SessionResult, which_extent_and_index}, }; #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] @@ -24,15 +23,6 @@ pub struct ArrayData { pub user_data: Bytes, } -impl ManifestSplits { - pub fn which_extent(&self, coord: &ChunkIndices) -> SessionResult<&ManifestExtents> { - Ok(self.0.get(self.which(coord)?).expect(&format!( - "logic bug, could not find ManifestExtents for this coordinate: {:?}", - coord - ))) - } -} - #[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)] pub struct SplitManifest { from: Vec, @@ -85,12 +75,6 @@ impl SplitManifest { #[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] pub struct ChangeSet { - // splitting configuration is recorded at the time the writable session is created - // we ignore any succeeding changes in repository config. - splitting: ManifestSplittingConfig, - // This is an optimization so that we needn't figure out the split sizes on every set. - // TODO: consider merging with `set_chunks` BTreeMap - splits: HashMap, new_groups: HashMap, new_arrays: HashMap, updated_arrays: HashMap, @@ -103,10 +87,6 @@ pub struct ChangeSet { } impl ChangeSet { - pub fn new(splitting: ManifestSplittingConfig) -> Self { - Self { splitting, ..Default::default() } - } - pub fn deleted_arrays(&self) -> impl Iterator { self.deleted_arrays.iter() } @@ -127,10 +107,6 @@ impl ChangeSet { self.deleted_arrays.contains(path_and_id) } - pub fn splits(&self, id: &NodeId) -> Option<&ManifestSplits> { - self.splits.get(id) - } - pub fn chunk_changes( &self, ) -> impl Iterator)> { @@ -174,39 +150,11 @@ impl ChangeSet { } } - fn maybe_update_cached_splits( - &mut self, - node_id: &NodeId, - path: &Path, - shape: &ArrayShape, - dimension_names: &Option>, - ) { - if !self.splits.contains_key(node_id) { - // Q: What happens if we set a chunk, then change a dimension name, so - // that the split changes. - // A: We ignore it. splits are set once for a node in a session, and are never changed. - let splits = self.splitting.get_split_sizes(path, shape, dimension_names); - self.splits.insert(node_id.clone(), splits); - } - } - pub fn add_array(&mut self, path: Path, node_id: NodeId, array_data: ArrayData) { - self.maybe_update_cached_splits( - &node_id, - &path, - &array_data.shape, - &array_data.dimension_names, - ); self.new_arrays.insert(path, (node_id, array_data)); } pub fn update_array(&mut self, node_id: &NodeId, path: &Path, array_data: ArrayData) { - self.maybe_update_cached_splits( - &node_id, - &path, - &array_data.shape, - &array_data.dimension_names, - ); match self.new_arrays.get(path) { Some((id, _)) => { debug_assert!(!self.updated_arrays.contains_key(id)); @@ -241,7 +189,6 @@ impl ChangeSet { self.updated_arrays.remove(node_id); self.set_chunks.remove(node_id); - self.splits.remove(node_id); if !is_new_array { self.deleted_arrays.insert((path, node_id.clone())); } @@ -269,13 +216,9 @@ impl ChangeSet { node_id: NodeId, coord: ChunkIndices, data: Option, + splits: &ManifestSplits, ) { - let cached_splits = self.splits.get(&node_id).expect(&format!( - "logic bug. change_set.splits should be populated for node {}", - node_id - )); - - let extent = cached_splits.which_extent(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); + let (_, extent) = splits.which_extent_and_index(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); // this implementation makes delete idempotent // it allows deleting a deleted chunk by repeatedly setting None. self.set_chunks @@ -285,7 +228,7 @@ impl ChangeSet { }) .or_insert_with(|| { let mut h = HashMap::::with_capacity( - cached_splits.len(), + splits.len(), ); h.entry(extent.clone()) // TODO: this is duplicative. I can't use `or_default` because it's @@ -301,17 +244,16 @@ impl ChangeSet { node_id: &NodeId, coords: &ChunkIndices, ) -> Option<&Option> { - self.splits - .get(node_id) - .and_then(|splits| { - splits.which_extent(coords).ok().map(|extent| { - self.set_chunks - .get(node_id) - .and_then(|h| h.get(&extent)) - .and_then(|s| s.chunks.get(coords)) + if let Some(node_chunks) = self.set_chunks.get(node_id) { + which_extent_and_index(node_chunks.keys(), coords) + .ok() + .map(|(_, extent)| { + node_chunks.get(&extent).and_then(|s| s.chunks.get(coords)) }) - }) - .flatten() + .flatten() + } else { + None + } } /// Drop the updated chunk references for the node. @@ -599,7 +541,7 @@ mod tests { change_set::ArrayData, format::{ ChunkIndices, NodeId, - manifest::{ChunkInfo, ChunkPayload}, + manifest::{ChunkInfo, ChunkPayload, ManifestSplits}, snapshot::ArrayShape, }, }; @@ -631,28 +573,39 @@ mod tests { ); assert_eq!(None, change_set.new_arrays_chunk_iterator().next()); - change_set.set_chunk_ref(node_id1.clone(), ChunkIndices(vec![0, 1]), None); + let splits = ManifestSplits::from_edges(vec![vec![0, 10], vec![0, 10]]); + + change_set.set_chunk_ref( + node_id1.clone(), + ChunkIndices(vec![0, 1]), + None, + &splits, + ); assert_eq!(None, change_set.new_arrays_chunk_iterator().next()); change_set.set_chunk_ref( node_id1.clone(), ChunkIndices(vec![1, 0]), Some(ChunkPayload::Inline("bar1".into())), + &splits, ); change_set.set_chunk_ref( node_id1.clone(), ChunkIndices(vec![1, 1]), Some(ChunkPayload::Inline("bar2".into())), + &splits, ); change_set.set_chunk_ref( node_id2.clone(), ChunkIndices(vec![0]), Some(ChunkPayload::Inline("baz1".into())), + &splits, ); change_set.set_chunk_ref( node_id2.clone(), ChunkIndices(vec![1]), Some(ChunkPayload::Inline("baz2".into())), + &splits, ); { diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 8175242a4..e1650a74b 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -161,31 +161,45 @@ impl From for SessionError { pub type SessionResult = Result; +// Returns the index of split_range that includes ChunkIndices +// This can be used at write time to split manifests based on the config +// and at read time to choose which manifest to query for chunk payload +pub fn which_extent_and_index<'a>( + iter: impl Iterator, + coord: &ChunkIndices, +) -> SessionResult<(usize, ManifestExtents)> { + // split_range[i] must bound ChunkIndices + // 0 <= return value <= split_range.len() + // it is possible that split_range does not include a coord. say we have 2x2 split grid + // but only split (0,0) and split (1,1) are populated with data. + // A coord located in (1, 0) should return Err + // Since split_range need not form a regular grid, we must iterate through and find the first result. + // ManifestExtents in split_range MUST NOT overlap with each other. How do we ensure this? + // ndim must be the same + // debug_assert_eq!(coord.0.len(), split_range[0].len()); + // FIXME: could optimize for unbounded single manifest + // Note: I don't think we can distinguish between out of bounds index for the array + // and an index that is part of a split that hasn't been written yet. + iter.enumerate() + .find(|(_, e)| e.contains(coord.0.as_slice())) + .map(|(i, e)| (i, e.clone())) + .ok_or( + SessionErrorKind::InvalidIndexForSplitManifests { coords: coord.clone() } + .into(), + ) +} + impl ManifestSplits { - // Returns the index of split_range that includes ChunkIndices - // This can be used at write time to split manifests based on the config - // and at read time to choose which manifest to query for chunk payload - pub fn which(&self, coord: &ChunkIndices) -> SessionResult { - // split_range[i] must bound ChunkIndices - // 0 <= return value <= split_range.len() - // it is possible that split_range does not include a coord. say we have 2x2 split grid - // but only split (0,0) and split (1,1) are populated with data. - // A coord located in (1, 0) should return Err - // Since split_range need not form a regular grid, we must iterate through and find the first result. - // ManifestExtents in split_range MUST NOT overlap with each other. How do we ensure this? - // ndim must be the same - // debug_assert_eq!(coord.0.len(), split_range[0].len()); - // FIXME: could optimize for unbounded single manifest - // Note: I don't think we can distinguish between out of bounds index for the array - // and an index that is part of a split that hasn't been written yet. - self.iter() - .enumerate() - .find(|(_, e)| e.contains(coord.0.as_slice())) - .map(|(i, _)| i) - .ok_or( - SessionErrorKind::InvalidIndexForSplitManifests { coords: coord.clone() } - .into(), - ) + pub fn which_extent_and_index( + &self, + coord: &ChunkIndices, + ) -> SessionResult<(usize, ManifestExtents)> { + which_extent_and_index(self.iter(), coord) + } + + #[cfg(test)] + pub fn which_index(&self, coord: &ChunkIndices) -> SessionResult { + which_extent_and_index(self.iter(), coord).map(|(i, _)| i) } } @@ -234,6 +248,8 @@ pub struct Session { snapshot_id: SnapshotId, change_set: ChangeSet, default_commit_metadata: SnapshotProperties, + // This is an optimization so that we needn't figure out the split sizes on every set. + splits: HashMap, } impl Session { @@ -255,6 +271,7 @@ impl Session { snapshot_id, change_set: ChangeSet::default(), default_commit_metadata: SnapshotProperties::default(), + splits: Default::default(), } } @@ -269,7 +286,6 @@ impl Session { snapshot_id: SnapshotId, default_commit_metadata: SnapshotProperties, ) -> Self { - let splitting = config.manifest().splitting().clone(); Self { config, storage_settings: Arc::new(storage_settings), @@ -278,8 +294,9 @@ impl Session { virtual_resolver, branch_name: Some(branch_name), snapshot_id, - change_set: ChangeSet::new(splitting), + change_set: Default::default(), default_commit_metadata, + splits: Default::default(), } } @@ -419,6 +436,7 @@ impl Session { match self.get_node(&path).await { Err(SessionError { kind: SessionErrorKind::NodeNotFound { .. }, .. }) => { let id = NodeId::random(); + self.cache_splits(&id, &path, &shape, &dimension_names); self.change_set.add_array( path, id, @@ -447,6 +465,7 @@ impl Session { user_data: Bytes, ) -> SessionResult<()> { self.get_array(path).await.map(|node| { + self.cache_splits(&node.id, path, &shape, &dimension_names); self.change_set.update_array( &node.id, path, @@ -511,6 +530,34 @@ impl Session { self.set_node_chunk_ref(node_snapshot, coord, data).await } + fn cache_splits( + &mut self, + node_id: &NodeId, + path: &Path, + shape: &ArrayShape, + dimension_names: &Option>, + ) { + let splitting = self.config.manifest().splitting(); + // Q: What happens if we set a chunk, then change a dimension name, so + // that the split changes. + // A: We ignore it. splits are set once for a node in a session, and are never changed. + let splits = splitting.get_split_sizes(path, shape, dimension_names); + self.splits.insert(node_id.clone(), splits); + } + + fn get_splits( + &mut self, + node_id: &NodeId, + path: &Path, + shape: &ArrayShape, + dimension_names: &Option>, + ) -> &ManifestSplits { + if !self.splits.contains_key(node_id) { + self.cache_splits(node_id, path, shape, dimension_names); + } + self.splits.get(node_id).expect("should not be possible.") + } + // Helper function that accepts a NodeSnapshot instead of a path, // this lets us do bulk sets (and deletes) without repeatedly grabbing the node. #[instrument(skip(self))] @@ -520,9 +567,14 @@ impl Session { coord: ChunkIndices, data: Option, ) -> SessionResult<()> { - if let NodeData::Array { shape, .. } = node.node_data { + if let NodeData::Array { shape, dimension_names, .. } = node.node_data { if shape.valid_chunk_coord(&coord) { - self.change_set.set_chunk_ref(node.id, coord, data); + let splits = self + .get_splits(&node.id, &node.path, &shape, &dimension_names) + // FIXME: this clone is a workaround for two mutable borrows + // on self.change_set + .clone(); + self.change_set.set_chunk_ref(node.id, coord, data, &splits); Ok(()) } else { Err(SessionErrorKind::InvalidIndex { @@ -784,7 +836,7 @@ impl Session { let splits = ManifestSplits::from_extents( manifests.iter().map(|m| m.extents.clone()).collect(), ); - let index = match splits.which(coords) { + let (index, _) = match splits.which_extent_and_index(coords) { Ok(index) => index, // for an invalid coordinate, we bail. // This happens for two cases: @@ -925,6 +977,7 @@ impl Session { &self.config, message, Some(properties), + &self.splits, ) .await } @@ -948,6 +1001,7 @@ impl Session { &self.config, message, Some(properties), + &self.splits, ) .await } @@ -1523,6 +1577,7 @@ struct FlushProcess<'a> { change_set: &'a ChangeSet, parent_id: &'a SnapshotId, config: &'a RepositoryConfig, + splits: &'a HashMap, manifest_refs: HashMap>, manifest_files: HashSet, } @@ -1533,12 +1588,14 @@ impl<'a> FlushProcess<'a> { change_set: &'a ChangeSet, parent_id: &'a SnapshotId, config: &'a RepositoryConfig, + splits: &'a HashMap, ) -> Self { Self { asset_manager, change_set, parent_id, config, + splits, manifest_refs: Default::default(), manifest_files: Default::default(), } @@ -1605,28 +1662,28 @@ impl<'a> FlushProcess<'a> { async fn write_manifest_for_new_node( &mut self, node_id: &NodeId, - node_path: &Path, ) -> SessionResult<()> { - let splits = self.change_set.splits(node_id).expect(&format!("logic bug, array at {} was added in this changeset, splits should be populated.", node_path)); + let splits = self.splits.get(node_id).expect(&format!( + "getting split for node {} unexpectedly failed", + node_id.clone() + )); + let mut refs = HashMap::::with_capacity(splits.len()); // TODO: this could be try_fold with the refs HashMap as state for extent in splits.iter() { - let cs_extents = self - .change_set - .array_manifest(node_id, extent) - .expect("logic bug. there should be a manifest for this extent ") - .extents(); - - let chunks = stream::iter( - self.change_set - .new_array_manifest_chunks_iterator(node_id, extent) - .map(Ok), - ); - self.write_manifest_from_iterator(chunks, cs_extents) - .await? - .map(|new_ref| refs.insert(extent.clone(), new_ref)); + if let Some(manifest) = self.change_set.array_manifest(node_id, extent) { + let cs_extents = manifest.extents(); + let chunks = stream::iter( + self.change_set + .new_array_manifest_chunks_iterator(node_id, extent) + .map(Ok), + ); + self.write_manifest_from_iterator(chunks, cs_extents) + .await? + .map(|new_ref| refs.insert(extent.clone(), new_ref)); + } } self.finalize_refs(node_id, refs) } @@ -1885,7 +1942,7 @@ async fn flush( for (node_path, node_id) in flush_data.change_set.new_arrays() { trace!(path=%node_path, "New node, writing a manifest"); - flush_data.write_manifest_for_new_node(node_id, node_path).await?; + flush_data.write_manifest_for_new_node(node_id).await?; } trace!("Building new snapshot"); @@ -2001,11 +2058,13 @@ async fn do_commit( config: &RepositoryConfig, message: &str, properties: Option, + splits: &HashMap, ) -> SessionResult { info!(branch_name, old_snapshot_id=%snapshot_id, "Commit started"); let parent_snapshot = snapshot_id.clone(); let properties = properties.unwrap_or_default(); - let flush_data = FlushProcess::new(asset_manager, change_set, snapshot_id, config); + let flush_data = + FlushProcess::new(asset_manager, change_set, snapshot_id, config, splits); let new_snapshot = flush(flush_data, message, properties).await?; debug!(branch_name, new_snapshot_id=%new_snapshot, "Updating branch"); @@ -2382,16 +2441,16 @@ mod tests { async fn test_which_split() -> Result<(), Box> { let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20]]); - assert_eq!(splits.which(&ChunkIndices(vec![1])).unwrap(), 0); - assert_eq!(splits.which(&ChunkIndices(vec![11])).unwrap(), 1); + assert_eq!(splits.which_index(&ChunkIndices(vec![1])).unwrap(), 0); + assert_eq!(splits.which_index(&ChunkIndices(vec![11])).unwrap(), 1); let edges = vec![vec![0, 10, 20], vec![0, 10, 20]]; let splits = ManifestSplits::from_edges(edges); - assert_eq!(splits.which(&ChunkIndices(vec![1, 1])).unwrap(), 0); - assert_eq!(splits.which(&ChunkIndices(vec![1, 10])).unwrap(), 1); - assert_eq!(splits.which(&ChunkIndices(vec![1, 11])).unwrap(), 1); - assert!(splits.which(&ChunkIndices(vec![21, 21])).is_err()); + assert_eq!(splits.which_index(&ChunkIndices(vec![1, 1])).unwrap(), 0); + assert_eq!(splits.which_index(&ChunkIndices(vec![1, 10])).unwrap(), 1); + assert_eq!(splits.which_index(&ChunkIndices(vec![1, 11])).unwrap(), 1); + assert!(splits.which_index(&ChunkIndices(vec![21, 21])).is_err()); Ok(()) } @@ -3180,8 +3239,10 @@ mod tests { ds.add_array(a2path.clone(), shape.clone(), dimension_names.clone(), def.clone()) .await?; + dbg!("added arrays, now commit"); let _ = ds.commit("first commit", None).await?; + dbg!("committed arrays"); // there should be no manifests yet because we didn't add any chunks assert_eq!( 0, @@ -3206,6 +3267,7 @@ mod tests { let mut ds = repo.writable_session("main").await?; + dbg!("setting chunk ref"); // add 3 chunks ds.set_chunk_ref( a1path.clone(), From 09cd44590f265b20431412d3fbc8414e860743f0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 6 Jun 2025 16:08:50 -0600 Subject: [PATCH 03/43] Revert to aggregate_extents --- icechunk/src/change_set.rs | 78 ++++---------------- icechunk/src/repository.rs | 1 - icechunk/src/session.rs | 147 ++++++++++++++++++++++++++++--------- 3 files changed, 125 insertions(+), 101 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index e48aec6e0..ae5fc45a5 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -23,64 +23,13 @@ pub struct ArrayData { pub user_data: Bytes, } -#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)] -pub struct SplitManifest { - from: Vec, - to: Vec, - // It's important we keep these sorted, we use this fact in TransactionLog creation - chunks: BTreeMap>, -} - -impl SplitManifest { - pub fn update(&mut self, coord: ChunkIndices, data: Option) { - if self.from.is_empty() { - debug_assert!(self.to.is_empty()); - debug_assert!(self.chunks.is_empty()); - // important to remember that `to` is not inclusive, so we need +1 - let mut coord0 = coord.0.clone(); - self.to.extend(coord0.iter().map(|n| *n + 1)); - self.from.append(&mut coord0); - } else { - for (existing, coord0) in self.from.iter_mut().zip(coord.0.iter()) { - if coord0 < existing { - *existing = *coord0 - } - } - for (existing, coord0) in self.to.iter_mut().zip(coord.0.iter()) { - // important to remember that `to` is not inclusive, so we need +1 - let range_value = coord0 + 1; - if range_value > *existing { - *existing = range_value - } - } - } - self.chunks.insert(coord, data); - } - - pub fn retain(&mut self, predicate: impl Fn(&ChunkIndices) -> bool) { - self.chunks.retain(|coord, _| { - if !predicate(coord) { - // FIXME: handle from, to updating - todo!(); - } else { - false - } - }) - } - - pub fn extents(&self) -> ManifestExtents { - ManifestExtents::new(self.from.as_slice(), self.to.as_slice()) - } -} - +type SplitManifest = BTreeMap>; #[derive(Clone, Debug, PartialEq, Eq, Default, Serialize, Deserialize)] pub struct ChangeSet { new_groups: HashMap, new_arrays: HashMap, updated_arrays: HashMap, updated_groups: HashMap, - // FIXME: It's important we keep these sorted, we use this fact in TransactionLog creation - // Change HashMap -> BTreeMap, need to check Ord on ManifestExtents set_chunks: BTreeMap>, deleted_groups: HashSet<(Path, NodeId)>, deleted_arrays: HashSet<(Path, NodeId)>, @@ -111,7 +60,7 @@ impl ChangeSet { &self, ) -> impl Iterator)> { self.set_chunks.iter().map(|(node_id, split_map)| { - (node_id, split_map.values().flat_map(|x| x.chunks.keys())) + (node_id, split_map.values().flat_map(|x| x.keys())) }) } @@ -224,17 +173,18 @@ impl ChangeSet { self.set_chunks .entry(node_id) .and_modify(|h| { - h.entry(extent.clone()).or_default().update(coord.clone(), data.clone()); + h.entry(extent.clone()).or_default().insert(coord.clone(), data.clone()); }) .or_insert_with(|| { - let mut h = HashMap::::with_capacity( - splits.len(), - ); + let mut h = HashMap::< + ManifestExtents, + BTreeMap>, + >::with_capacity(splits.len()); h.entry(extent.clone()) // TODO: this is duplicative. I can't use `or_default` because it's // nice to create the HashMap using `with_capacity` .or_default() - .update(coord, data); + .insert(coord, data); h }); } @@ -247,9 +197,7 @@ impl ChangeSet { if let Some(node_chunks) = self.set_chunks.get(node_id) { which_extent_and_index(node_chunks.keys(), coords) .ok() - .map(|(_, extent)| { - node_chunks.get(&extent).and_then(|s| s.chunks.get(coords)) - }) + .map(|(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords))) .flatten() } else { None @@ -261,11 +209,11 @@ impl ChangeSet { pub fn drop_chunk_changes( &mut self, node_id: &NodeId, - predicate: impl Fn(&ChunkIndices) -> bool + Copy, + predicate: impl Fn(&ChunkIndices) -> bool, ) { if let Some(changes) = self.set_chunks.get_mut(node_id) { for split in changes.values_mut() { - split.retain(predicate); + split.retain(|coord, _| !predicate(coord)); } } } @@ -287,7 +235,7 @@ impl ChangeSet { .filter(move |(manifest_extent, _)| { extent.is_none() || Some(*manifest_extent) == extent.as_ref() }) - .flat_map(|(_, manifest)| manifest.chunks.iter()), + .flat_map(|(_, manifest)| manifest.iter()), ), } } @@ -345,7 +293,7 @@ impl ChangeSet { extent: &ManifestExtents, ) -> impl Iterator + use<'a> { if let Some(manifest) = self.array_manifest(node_id, extent) { - Either::Right(manifest.chunks.iter().filter_map(move |(coords, payload)| { + Either::Right(manifest.iter().filter_map(move |(coords, payload)| { payload.as_ref().map(|p| ChunkInfo { node: node_id.clone(), coord: coords.clone(), diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 54668c50c..d778dc9cf 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1488,7 +1488,6 @@ mod tests { add += (axis_size as u32).div_ceil(expected_split_sizes[ax]) as usize - 1 * ((ax > 0) as usize); - dbg!(&ax, &add); total_manifests += add; session.commit(format!("finished axis {0}", ax).as_ref(), None).await?; assert_manifest_count(&backend, total_manifests).await; diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index e1650a74b..2ed81e661 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1605,7 +1605,6 @@ impl<'a> FlushProcess<'a> { &mut self, node: &NodeSnapshot, extent: &ManifestExtents, - actual_extents: ManifestExtents, ) -> SessionResult> { let asset_manager = Arc::clone(&self.asset_manager); let updated_chunks = updated_node_chunks_iterator( @@ -1617,14 +1616,17 @@ impl<'a> FlushProcess<'a> { ) .await .map_ok(|(_path, chunk_info)| chunk_info); - self.write_manifest_from_iterator(updated_chunks, actual_extents).await + self.write_manifest_from_iterator(updated_chunks).await } async fn write_manifest_from_iterator( &mut self, chunks: impl Stream>, - actual_extents: ManifestExtents, ) -> SessionResult> { + let mut from = vec![]; + let mut to = vec![]; + let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord); + if let Some(new_manifest) = Manifest::from_stream(chunks).await.unwrap() { let new_manifest = Arc::new(new_manifest); let new_manifest_size = @@ -1636,7 +1638,7 @@ impl<'a> FlushProcess<'a> { let new_ref = ManifestRef { object_id: new_manifest.id().clone(), - extents: actual_extents, + extents: ManifestExtents::new(&from, &to), }; Ok(Some(new_ref)) } else { @@ -1673,14 +1675,13 @@ impl<'a> FlushProcess<'a> { // TODO: this could be try_fold with the refs HashMap as state for extent in splits.iter() { - if let Some(manifest) = self.change_set.array_manifest(node_id, extent) { - let cs_extents = manifest.extents(); + if self.change_set.array_manifest(node_id, extent).is_some() { let chunks = stream::iter( self.change_set .new_array_manifest_chunks_iterator(node_id, extent) .map(Ok), ); - self.write_manifest_from_iterator(chunks, cs_extents) + self.write_manifest_from_iterator(chunks) .await? .map(|new_ref| refs.insert(extent.clone(), new_ref)); } @@ -1716,28 +1717,17 @@ impl<'a> FlushProcess<'a> { // TODO: this should be try_fold with the refs HashMap as state for extent in splits.iter() { - let on_disk_bbox = on_disk_extents - .iter() - .filter_map(|e| e.intersection(extent)) - .reduce(|a, b| a.union(&b)); - if modified_splits.contains(extent) { // this split was modified in this session, rewrite it completely - let cs_extents = self - .change_set - .array_manifest(&node.id, extent) - .expect("logic bug. there should be a manifest for this extent ") - .extents(); - let actual_extents = - // if there are splits on disk that overlap, then we take that Extents - // and union it with the Extents of the chunks the changeset - on_disk_bbox.map(|x| cs_extents.union(&x)) - // if no overlap, then just use the changeset Extents - .unwrap_or(cs_extents); - self.write_manifest_for_updated_chunks(&node, extent, actual_extents) + self.write_manifest_for_updated_chunks(&node, extent) .await? .map(|new_ref| refs.insert(extent.clone(), new_ref)); } else { + let on_disk_bbox = on_disk_extents + .iter() + .filter_map(|e| e.intersection(extent)) + .reduce(|a, b| a.union(&b)); + // split was unmodified in this session. Let's look at the current manifests // and see what we need to do with them for old_ref in manifests.iter() { @@ -1753,13 +1743,9 @@ impl<'a> FlushProcess<'a> { // the splits have changed, but no refs in this split have been written in this session // same as `if` block above debug_assert!(on_disk_bbox.is_some()); - self.write_manifest_for_updated_chunks( - &node, - extent, - on_disk_bbox.clone().expect("logic bug in writing manifests from disk for partially overlapping split"), - ) - .await? - .map(|new_ref| refs.insert(extent.clone(), new_ref)); + self.write_manifest_for_updated_chunks(&node, extent) + .await? + .map(|new_ref| refs.insert(extent.clone(), new_ref)); } Overlap::None => { // Nothing to do @@ -2105,6 +2091,63 @@ async fn fetch_manifest( Ok(asset_manager.fetch_manifest(manifest_id, manifest_info.size_bytes).await?) } +/// Map the iterator to accumulate the extents of the chunks traversed +/// +/// As we are processing chunks to create a manifest, we need to keep track +/// of the extents of the manifests. This means, for each coordinate, we need +/// to record its minimum and maximum values. +/// +/// This very ugly code does that, without having to traverse the iterator twice. +/// It adapts the stream using [`StreamExt::map_ok`] and keeps a running min/max +/// for each coordinate. +/// +/// When the iterator is fully traversed, the min and max values will be +/// available in `from` and `to` arguments. +/// +/// Yes, this is horrible. +fn aggregate_extents<'a, T: std::fmt::Debug, E>( + from: &'a mut Vec, + to: &'a mut Vec, + it: impl Stream> + 'a, + extract_index: impl for<'b> Fn(&'b T) -> &'b ChunkIndices + 'a, +) -> impl Stream> + 'a { + // we initialize the destination with an empty array, because we don't know + // the dimensions of the array yet. On the first element we will re-initialize + *from = Vec::new(); + *to = Vec::new(); + it.map_ok(move |t| { + // these are the coordinates for the chunk + let idx = extract_index(&t); + + // we need to initialize the mins/maxes the first time + // we initialize with the value of the first element + // this obviously doesn't work for empty streams + // but we never generate manifests for them + if from.is_empty() { + *from = idx.0.clone(); + // important to remember that `to` is not inclusive, so we need +1 + *to = idx.0.iter().map(|n| n + 1).collect(); + } else { + // We need to iterate over coordinates, and update the + // minimum and maximum for each if needed + for (coord_idx, value) in idx.0.iter().enumerate() { + if let Some(from_current) = from.get_mut(coord_idx) { + if value < from_current { + *from_current = *value + } + } + if let Some(to_current) = to.get_mut(coord_idx) { + let range_value = value + 1; + if range_value > *to_current { + *to_current = range_value + } + } + } + } + t + }) +} + #[cfg(test)] #[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)] mod tests { @@ -2125,8 +2168,8 @@ mod tests { repository::VersionInfo, storage::new_in_memory_storage, strategies::{ - ShapeDim, empty_writable_session, manifest_extents, node_paths, - shapes_and_dims, + ShapeDim, chunk_indices, empty_writable_session, manifest_extents, + node_paths, shapes_and_dims, }, }; @@ -2416,7 +2459,6 @@ mod tests { assert_eq!(e1.intersection(&e2), None); assert_eq!(e1.union(&e2), union); - let e1 = ManifestExtents::new( vec![0u32, 1, 2].as_slice(), vec![2u32, 4, 6].as_slice(), @@ -2429,13 +2471,48 @@ mod tests { assert_eq!(overlaps(&e2, &e1), Overlap::None); // this should create non-overlapping extents - let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20], vec![0, 1, 2], vec![0, 21, 22]]); + let splits = ManifestSplits::from_edges(vec![ + vec![0, 10, 20], + vec![0, 1, 2], + vec![0, 21, 22], + ]); for vec in splits.iter().combinations(2) { assert_eq!(overlaps(vec[0], vec[1]), Overlap::None) } Ok(()) } + #[proptest(async = "tokio")] + async fn test_aggregate_extents( + #[strategy(proptest::collection::vec(chunk_indices(3, 0..1_000_000), 1..50))] + indices: Vec, + ) { + let mut from = vec![]; + let mut to = vec![]; + + let expected_from = vec![ + indices.iter().map(|i| i.0[0]).min().unwrap(), + indices.iter().map(|i| i.0[1]).min().unwrap(), + indices.iter().map(|i| i.0[2]).min().unwrap(), + ]; + let expected_to = vec![ + indices.iter().map(|i| i.0[0]).max().unwrap() + 1, + indices.iter().map(|i| i.0[1]).max().unwrap() + 1, + indices.iter().map(|i| i.0[2]).max().unwrap() + 1, + ]; + + let _ = aggregate_extents( + &mut from, + &mut to, + stream::iter(indices.into_iter().map(Ok::)), + |idx| idx, + ) + .count() + .await; + + prop_assert_eq!(from, expected_from); + prop_assert_eq!(to, expected_to); + } #[tokio::test] async fn test_which_split() -> Result<(), Box> { From 8924a39c35de182127b65b08bd782fb616c5ad0b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 6 Jun 2025 16:37:19 -0600 Subject: [PATCH 04/43] Tests pass! --- icechunk/src/change_set.rs | 6 +----- icechunk/src/session.rs | 44 ++++++++++++++++---------------------- 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index ae5fc45a5..31af358a6 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -171,7 +171,7 @@ impl ChangeSet { // this implementation makes delete idempotent // it allows deleting a deleted chunk by repeatedly setting None. self.set_chunks - .entry(node_id) + .entry(node_id.clone()) .and_modify(|h| { h.entry(extent.clone()).or_default().insert(coord.clone(), data.clone()); }) @@ -335,9 +335,6 @@ impl ChangeSet { /// Results of the merge are applied to `self`. Changes present in `other` take precedence over /// `self` changes. pub fn merge(&mut self, other: ChangeSet) { - // FIXME: what do I do with splitting and splits here. - // FIXME: this should detect conflict, for example, if different writers added on the same - // path, different objects, or if the same path is added and deleted, etc. // TODO: optimize self.new_groups.extend(other.new_groups); self.new_arrays.extend(other.new_arrays); @@ -345,7 +342,6 @@ impl ChangeSet { self.updated_arrays.extend(other.updated_arrays); self.deleted_groups.extend(other.deleted_groups); self.deleted_arrays.extend(other.deleted_arrays); - // FIXME: handle splits for (node, other_chunks) in other.set_chunks.into_iter() { match self.set_chunks.remove(&node) { diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 2ed81e661..3b5b9f66d 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -465,7 +465,10 @@ impl Session { user_data: Bytes, ) -> SessionResult<()> { self.get_array(path).await.map(|node| { - self.cache_splits(&node.id, path, &shape, &dimension_names); + // Q: What happens if we set a chunk, then change a dimension name, so + // that the split changes. + // A: We ignore it. splits are set once for a node in a session, and are never changed. + // self.cache_splits(&node.id, path, &shape, &dimension_names); self.change_set.update_array( &node.id, path, @@ -538,9 +541,6 @@ impl Session { dimension_names: &Option>, ) { let splitting = self.config.manifest().splitting(); - // Q: What happens if we set a chunk, then change a dimension name, so - // that the split changes. - // A: We ignore it. splits are set once for a node in a session, and are never changed. let splits = splitting.get_split_sizes(path, shape, dimension_names); self.splits.insert(node_id.clone(), splits); } @@ -974,7 +974,6 @@ impl Session { branch_name, &self.snapshot_id, &self.change_set, - &self.config, message, Some(properties), &self.splits, @@ -998,7 +997,6 @@ impl Session { branch_name, &self.snapshot_id, &self.change_set, - &self.config, message, Some(properties), &self.splits, @@ -1291,6 +1289,7 @@ async fn verified_node_chunk_iterator<'a>( let new_chunks = change_set .array_chunks_iterator(&node.id, &node.path, extent.clone()) .filter_map(move |(idx, payload)| { + dbg!("iterating through ", &idx, &payload); payload.as_ref().map(|payload| { Ok(ChunkInfo { node: node_id_c.clone(), @@ -1576,7 +1575,6 @@ struct FlushProcess<'a> { asset_manager: Arc, change_set: &'a ChangeSet, parent_id: &'a SnapshotId, - config: &'a RepositoryConfig, splits: &'a HashMap, manifest_refs: HashMap>, manifest_files: HashSet, @@ -1587,14 +1585,12 @@ impl<'a> FlushProcess<'a> { asset_manager: Arc, change_set: &'a ChangeSet, parent_id: &'a SnapshotId, - config: &'a RepositoryConfig, splits: &'a HashMap, ) -> Self { Self { asset_manager, change_set, parent_id, - config, splits, manifest_refs: Default::default(), manifest_files: Default::default(), @@ -1636,6 +1632,7 @@ impl<'a> FlushProcess<'a> { ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); self.manifest_files.insert(file_info); + dbg!(&from, &to); let new_ref = ManifestRef { object_id: new_manifest.id().clone(), extents: ManifestExtents::new(&from, &to), @@ -1695,10 +1692,13 @@ impl<'a> FlushProcess<'a> { async fn write_manifest_for_existing_node( &mut self, node: &NodeSnapshot, - splits: ManifestSplits, manifests: Vec, ) -> SessionResult<()> { - // populate with existing refs, if they are compatiblae + let splits = self + .splits + .get(&node.id) + .expect(&format!("splits should exist for this node {}", node.id.clone())); + // populate with existing refs, if they are compatible let mut refs = HashMap::::with_capacity(splits.len()); @@ -1877,7 +1877,6 @@ async fn flush( let old_snapshot = flush_data.asset_manager.fetch_snapshot(flush_data.parent_id).await?; - let splitting_config = flush_data.config.manifest().splitting(); // We first go through all existing nodes to see if we need to rewrite any manifests @@ -1903,17 +1902,8 @@ async fn flush( &node.path, ) .await?; - if let NodeData::Array { shape, dimension_names, manifests } = - new_node.node_data - { - let splits = splitting_config.get_split_sizes( - &new_node.path, - &shape, - &dimension_names, - ); - flush_data - .write_manifest_for_existing_node(&node, splits, manifests) - .await?; + if let NodeData::Array { manifests, .. } = new_node.node_data { + flush_data.write_manifest_for_existing_node(&node, manifests).await?; } } else { trace!(path=%node.path, "Node has no changes, keeping the previous manifest"); @@ -2041,7 +2031,6 @@ async fn do_commit( branch_name: &str, snapshot_id: &SnapshotId, change_set: &ChangeSet, - config: &RepositoryConfig, message: &str, properties: Option, splits: &HashMap, @@ -2049,8 +2038,7 @@ async fn do_commit( info!(branch_name, old_snapshot_id=%snapshot_id, "Commit started"); let parent_snapshot = snapshot_id.clone(); let properties = properties.unwrap_or_default(); - let flush_data = - FlushProcess::new(asset_manager, change_set, snapshot_id, config, splits); + let flush_data = FlushProcess::new(asset_manager, change_set, snapshot_id, splits); let new_snapshot = flush(flush_data, message, properties).await?; debug!(branch_name, new_snapshot_id=%new_snapshot, "Updating branch"); @@ -2119,6 +2107,8 @@ fn aggregate_extents<'a, T: std::fmt::Debug, E>( // these are the coordinates for the chunk let idx = extract_index(&t); + dbg!("processing index ", &idx); + // we need to initialize the mins/maxes the first time // we initialize with the value of the first element // this obviously doesn't work for empty streams @@ -2979,6 +2969,8 @@ mod tests { Some(ChunkPayload::Inline("new chunk".into())) ); + dbg!("deleting chunk"); + // we delete a chunk ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0, 0, 1]), None) .await?; From ca2d1713a92c5134011b9aa07e34fa1e6507e3b8 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 6 Jun 2025 16:43:27 -0600 Subject: [PATCH 05/43] sqw iterator --- icechunk/src/change_set.rs | 25 ++++--------------------- icechunk/src/session.rs | 11 ++++++++--- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 31af358a6..f6e882e80 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -244,7 +244,8 @@ impl ChangeSet { &self, ) -> impl Iterator + use<'_> { self.new_arrays.iter().flat_map(|(path, (node_id, _))| { - self.new_array_chunk_iterator(node_id, path).map(|ci| (path.clone(), ci)) + self.new_array_chunk_iterator(node_id, path, None) + .map(|ci| (path.clone(), ci)) }) } @@ -252,8 +253,9 @@ impl ChangeSet { &'a self, node_id: &'a NodeId, node_path: &Path, + extent: Option, ) -> impl Iterator + use<'a> { - self.array_chunks_iterator(node_id, node_path, None).filter_map( + self.array_chunks_iterator(node_id, node_path, extent).filter_map( move |(coords, payload)| { payload.as_ref().map(|p| ChunkInfo { node: node_id.clone(), @@ -286,25 +288,6 @@ impl ChangeSet { self.set_chunks.get(node_id).and_then(|x| x.get(extent)) } - /// Iterator over chunks for a new array for a given ManifestExtents - pub fn new_array_manifest_chunks_iterator<'a>( - &'a self, - node_id: &'a NodeId, - extent: &ManifestExtents, - ) -> impl Iterator + use<'a> { - if let Some(manifest) = self.array_manifest(node_id, extent) { - Either::Right(manifest.iter().filter_map(move |(coords, payload)| { - payload.as_ref().map(|p| ChunkInfo { - node: node_id.clone(), - coord: coords.clone(), - payload: p.clone(), - }) - })) - } else { - Either::Left(iter::empty()) - } - } - pub fn new_nodes(&self) -> impl Iterator { self.new_groups().chain(self.new_arrays()) } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 3b5b9f66d..21afc6b87 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -897,7 +897,7 @@ impl Session { let res = try_stream! { let new_chunks = stream::iter( self.change_set - .new_array_chunk_iterator(&node.id, array_path) + .new_array_chunk_iterator(&node.id, array_path, None) .map(|chunk_info| Ok::(chunk_info.coord)), ); @@ -1661,6 +1661,7 @@ impl<'a> FlushProcess<'a> { async fn write_manifest_for_new_node( &mut self, node_id: &NodeId, + node_path: &Path, ) -> SessionResult<()> { let splits = self.splits.get(node_id).expect(&format!( "getting split for node {} unexpectedly failed", @@ -1675,7 +1676,11 @@ impl<'a> FlushProcess<'a> { if self.change_set.array_manifest(node_id, extent).is_some() { let chunks = stream::iter( self.change_set - .new_array_manifest_chunks_iterator(node_id, extent) + .new_array_chunk_iterator( + node_id, + node_path, + Some(extent.clone()), + ) .map(Ok), ); self.write_manifest_from_iterator(chunks) @@ -1918,7 +1923,7 @@ async fn flush( for (node_path, node_id) in flush_data.change_set.new_arrays() { trace!(path=%node_path, "New node, writing a manifest"); - flush_data.write_manifest_for_new_node(node_id).await?; + flush_data.write_manifest_for_new_node(node_id, node_path).await?; } trace!("Building new snapshot"); From 5e843be6fb37d7eb776d5ff359968b143f082e27 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 6 Jun 2025 16:45:58 -0600 Subject: [PATCH 06/43] cleanup --- icechunk/src/session.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 21afc6b87..16cd43239 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1289,7 +1289,6 @@ async fn verified_node_chunk_iterator<'a>( let new_chunks = change_set .array_chunks_iterator(&node.id, &node.path, extent.clone()) .filter_map(move |(idx, payload)| { - dbg!("iterating through ", &idx, &payload); payload.as_ref().map(|payload| { Ok(ChunkInfo { node: node_id_c.clone(), @@ -1632,7 +1631,6 @@ impl<'a> FlushProcess<'a> { ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); self.manifest_files.insert(file_info); - dbg!(&from, &to); let new_ref = ManifestRef { object_id: new_manifest.id().clone(), extents: ManifestExtents::new(&from, &to), @@ -2112,8 +2110,6 @@ fn aggregate_extents<'a, T: std::fmt::Debug, E>( // these are the coordinates for the chunk let idx = extract_index(&t); - dbg!("processing index ", &idx); - // we need to initialize the mins/maxes the first time // we initialize with the value of the first element // this obviously doesn't work for empty streams @@ -2974,8 +2970,6 @@ mod tests { Some(ChunkPayload::Inline("new chunk".into())) ); - dbg!("deleting chunk"); - // we delete a chunk ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0, 0, 1]), None) .await?; From 03d62accc31885bae1d1d06f06c41717406d3184 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 6 Jun 2025 17:03:35 -0600 Subject: [PATCH 07/43] lint --- icechunk/src/change_set.rs | 10 +++++----- icechunk/src/session.rs | 33 ++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index f6e882e80..87c0e124e 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -69,7 +69,7 @@ impl ChangeSet { } pub fn arrays_with_chunk_changes(&self) -> impl Iterator { - self.set_chunks.iter().map(|(node, _)| node) + self.set_chunks.keys() } pub fn is_empty(&self) -> bool { @@ -167,6 +167,7 @@ impl ChangeSet { data: Option, splits: &ManifestSplits, ) { + #[allow(clippy::expect_used)] let (_, extent) = splits.which_extent_and_index(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); // this implementation makes delete idempotent // it allows deleting a deleted chunk by repeatedly setting None. @@ -195,10 +196,9 @@ impl ChangeSet { coords: &ChunkIndices, ) -> Option<&Option> { if let Some(node_chunks) = self.set_chunks.get(node_id) { - which_extent_and_index(node_chunks.keys(), coords) - .ok() - .map(|(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords))) - .flatten() + which_extent_and_index(node_chunks.keys(), coords).ok().and_then( + |(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords)), + ) } else { None } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 16cd43239..dfa91d662 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -229,11 +229,11 @@ pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap { } if overlaps.iter().all(|x| x == &Overlap::Complete) { - return Overlap::Complete; + Overlap::Complete } else if overlaps.iter().any(|x| x == &Overlap::None) { - return Overlap::None; + Overlap::None } else { - return Overlap::Partial; + Overlap::Partial } } @@ -555,6 +555,7 @@ impl Session { if !self.splits.contains_key(node_id) { self.cache_splits(node_id, path, shape, dimension_names); } + #[allow(clippy::expect_used)] self.splits.get(node_id).expect("should not be possible.") } @@ -1303,7 +1304,7 @@ async fn verified_node_chunk_iterator<'a>( futures::stream::iter(manifests) .filter(move |manifest_ref| { futures::future::ready(extent.as_ref().is_none_or(|e| { - overlaps(&manifest_ref.extents, &e) != Overlap::None + overlaps(&manifest_ref.extents, e) != Overlap::None })) }) .then(move |manifest_ref| { @@ -1622,7 +1623,11 @@ impl<'a> FlushProcess<'a> { let mut to = vec![]; let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord); - if let Some(new_manifest) = Manifest::from_stream(chunks).await.unwrap() { + #[allow(clippy::expect_used)] + if let Some(new_manifest) = Manifest::from_stream(chunks) + .await + .expect("failed to create manifest from chunk stream") + { let new_manifest = Arc::new(new_manifest); let new_manifest_size = self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?; @@ -1661,10 +1666,9 @@ impl<'a> FlushProcess<'a> { node_id: &NodeId, node_path: &Path, ) -> SessionResult<()> { - let splits = self.splits.get(node_id).expect(&format!( - "getting split for node {} unexpectedly failed", - node_id.clone() - )); + #[allow(clippy::expect_used)] + let splits = + self.splits.get(node_id).expect("getting split for node unexpectedly failed"); let mut refs = HashMap::::with_capacity(splits.len()); @@ -1697,10 +1701,9 @@ impl<'a> FlushProcess<'a> { node: &NodeSnapshot, manifests: Vec, ) -> SessionResult<()> { - let splits = self - .splits - .get(&node.id) - .expect(&format!("splits should exist for this node {}", node.id.clone())); + #[allow(clippy::expect_used)] + let splits = + self.splits.get(&node.id).expect("splits should exist for this node."); // populate with existing refs, if they are compatible let mut refs = HashMap::::with_capacity(splits.len()); @@ -1722,7 +1725,7 @@ impl<'a> FlushProcess<'a> { for extent in splits.iter() { if modified_splits.contains(extent) { // this split was modified in this session, rewrite it completely - self.write_manifest_for_updated_chunks(&node, extent) + self.write_manifest_for_updated_chunks(node, extent) .await? .map(|new_ref| refs.insert(extent.clone(), new_ref)); } else { @@ -1746,7 +1749,7 @@ impl<'a> FlushProcess<'a> { // the splits have changed, but no refs in this split have been written in this session // same as `if` block above debug_assert!(on_disk_bbox.is_some()); - self.write_manifest_for_updated_chunks(&node, extent) + self.write_manifest_for_updated_chunks(node, extent) .await? .map(|new_ref| refs.insert(extent.clone(), new_ref)); } From ef37e5813a40ff41d29fdb1b5dd4301ff93e6253 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 6 Jun 2025 22:41:25 -0600 Subject: [PATCH 08/43] Fix bug --- icechunk-python/tests/test_manifest_splitting.py | 2 +- icechunk/src/format/mod.rs | 2 +- icechunk/src/session.rs | 14 +++++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/icechunk-python/tests/test_manifest_splitting.py b/icechunk-python/tests/test_manifest_splitting.py index b29fb7b18..479cecd33 100644 --- a/icechunk-python/tests/test_manifest_splitting.py +++ b/icechunk-python/tests/test_manifest_splitting.py @@ -102,7 +102,7 @@ def test_manifest_splitting_appends(): nchunks += math.prod(NEWSHAPE) * 2 # the lon size goes from 17 -> 19 so one extra manifest, # compared to previous writes - nmanifests += 7 * 2 + nmanifests += 2 * 2 assert len(os.listdir(f"{tmpdir}/chunks")) == nchunks assert len(os.listdir(f"{tmpdir}/manifests")) == nmanifests diff --git a/icechunk/src/format/mod.rs b/icechunk/src/format/mod.rs index 58f1ebc7c..557c0b778 100644 --- a/icechunk/src/format/mod.rs +++ b/icechunk/src/format/mod.rs @@ -244,7 +244,7 @@ pub enum IcechunkFormatErrorKind { NodeNotFound { path: Path }, #[error("chunk coordinates not found `{coords:?}`")] ChunkCoordinatesNotFound { coords: ChunkIndices }, - #[error("manifest information cannot be found in snapshot `{manifest_id}`")] + #[error("manifest information cannot be found in snapshot for id `{manifest_id}`")] ManifestInfoNotFound { manifest_id: ManifestId }, #[error("invalid magic numbers in file")] InvalidMagicNumbers, // TODO: add more info diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index dfa91d662..fe27f1e41 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1700,6 +1700,7 @@ impl<'a> FlushProcess<'a> { &mut self, node: &NodeSnapshot, manifests: Vec, + old_snapshot: &Snapshot, ) -> SessionResult<()> { #[allow(clippy::expect_used)] let splits = @@ -1744,6 +1745,11 @@ impl<'a> FlushProcess<'a> { debug_assert!(on_disk_bbox.is_some()); // Just propagate this ref again, no rewriting necessary refs.insert(extent.clone(), old_ref.clone()); + // OK to unwrap here since this manifest file must exist in the old snapshot + #[allow(clippy::expect_used)] + self.manifest_files.insert( + old_snapshot.manifest_info(&old_ref.object_id).expect("logic bug. creating manifest file info for an existing manifest failed."), + ); } Overlap::Partial => { // the splits have changed, but no refs in this split have been written in this session @@ -1909,7 +1915,13 @@ async fn flush( ) .await?; if let NodeData::Array { manifests, .. } = new_node.node_data { - flush_data.write_manifest_for_existing_node(&node, manifests).await?; + flush_data + .write_manifest_for_existing_node( + &node, + manifests, + old_snapshot.as_ref(), + ) + .await?; } } else { trace!(path=%node.path, "Node has no changes, keeping the previous manifest"); From 9fdd0d3ec53e172fc96c175c86b1aba844152863 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 Jun 2025 13:37:44 -0600 Subject: [PATCH 09/43] Minor changes --- icechunk/proptest-regressions/session.txt | 2 + icechunk/src/change_set.rs | 2 +- icechunk/src/conflicts/detector.rs | 2 +- icechunk/src/format/manifest.rs | 7 +-- icechunk/src/format/transaction_log.rs | 2 +- icechunk/src/session.rs | 56 ++++++++++++++++------- icechunk/src/strategies.rs | 2 +- 7 files changed, 50 insertions(+), 23 deletions(-) diff --git a/icechunk/proptest-regressions/session.txt b/icechunk/proptest-regressions/session.txt index ce82f8b5d..f938988b2 100644 --- a/icechunk/proptest-regressions/session.txt +++ b/icechunk/proptest-regressions/session.txt @@ -8,3 +8,5 @@ cc da94eced751096504c0803bed6ad66cde255567c3cf6c0b316cce66c22e3142a # shrinks to cc b0a66d6fdd012c51dd804b9f6c58e4403b2dd41f15c3856adc8d90e3d42311fc # shrinks to (initial_state, transitions, seen_counter) = (RepositoryModel { arrays: {}, groups: [] }, [AddArray(Path(Utf8PathBuf { _encoding: "unix", inner: "/" }), ZarrArrayMetadata { shape: [1], data_type: Bool, chunk_shape: ChunkShape([1]), chunk_key_encoding: Slash, fill_value: Bool(false), codecs: [Codec { name: "mycodec", configuration: None }], storage_transformers: Some([StorageTransformer { name: "mytransformer", configuration: None }]), dimension_names: None })], None) cc 4f7049d25e420db7b98fcadb0fe6bc7576d3bbc6eb3b971074e6f7257282d040 # shrinks to input = _TestAddDeleteArrayArgs { path: Path(Utf8PathBuf { _encoding: "unix", inner: "/" }), metadata: ZarrArrayMetadata { shape: [1], data_type: Bool, chunk_shape: ChunkShape([1]), chunk_key_encoding: Slash, fill_value: Bool(false), codecs: [Codec { name: "mycodec", configuration: None }], storage_transformers: Some([StorageTransformer { name: "mytransformer", configuration: None }]), dimension_names: None }, session: Session { config: RepositoryConfig { inline_chunk_threshold_bytes: 512, unsafe_overwrite_refs: false, get_partial_values_concurrency: 10, compression: CompressionConfig { algorithm: Zstd, level: 1 }, caching: CachingConfig { snapshots_cache_size: 2, manifests_cache_size: 2, transactions_cache_size: 0, attributes_cache_size: 2, chunks_cache_size: 0 }, storage: None, virtual_chunk_containers: {"file": VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, "tigris": VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, "s3": VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }, "gcs": VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }, "az": VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }} }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n algorithm: Zstd\n level: 1\ncaching:\n snapshots_cache_size: 2\n manifests_cache_size: 2\n transactions_cache_size: 0\n attributes_cache_size: 2\n chunks_cache_size: 0\nstorage: null\nvirtual_chunk_containers:\n file:\n name: file\n url_prefix: file\n store: !LocalFileSystem ''\n tigris:\n name: tigris\n url_prefix: tigris\n store: !Tigris {}\n s3:\n name: s3\n url_prefix: s3\n store: !S3\n region: null\n endpoint_url: null\n anonymous: false\n allow_http: false\n gcs:\n name: gcs\n url_prefix: gcs\n store: !Gcs {}\n az:\n name: az\n url_prefix: az\n store: !Azure {}\n", last_modified: 2025-01-08T15:41:23.520617823Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"6EPXF9PX9JPEY8F4Q9KG\"}", last_modified: 2025-01-08T15:41:23.520470841Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/6EPXF9PX9JPEY8F4Q9KG" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic- \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb4QJ6QBBJ59Y0VSRKHH1GG\xbe2025-01-08T15:41:23.520187204Z\xb6Repository initialized9404Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.520418067Z, attributes: Attributes({Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, asset_resolver: AssetResolver { storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n algorithm: Zstd\n level: 1\ncaching:\n snapshots_cache_size: 2\n manifests_cache_size: 2\n transactions_cache_size: 0\n attributes_cache_size: 2\n chunks_cache_size: 0\nstorage: null\nvirtual_chunk_containers:\n file:\n name: file\n url_prefix: file\n store: !LocalFileSystem ''\n tigris:\n name: tigris\n url_prefix: tigris\n store: !Tigris {}\n s3:\n name: s3\n url_prefix: s3\n store: !S3\n region: null\n endpoint_url: null\n anonymous: false\n allow_http: false\n gcs:\n name: gcs\n url_prefix: gcs\n store: !Gcs {}\n az:\n name: az\n url_prefix: az\n store: !Azure {}\n", last_modified: 2025-01-08T15:41:23.520617823Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"6EPXF9PX9JPEY8F4Q9KG\"}", last_modified: 2025-01-08T15:41:23.520470841Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/6EPXF9PX9JPEY8F4Q9KG" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic- \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb4QJ6QBBJ59Y0VSRKHH1GG\xbe2025-01-08T15:41:23.520187204Z\xb6Repository initialized9404Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.520418067Z, attributes: Attributes({Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, num_snapshots: 2, num_manifests: 2, num_transactions: 0, num_attributes: 2, num_chunks: 0, snapshot_cache: Cache { .. }, manifest_cache: Cache { .. }, transactions_cache: Cache { .. }, attributes_cache: Cache { .. }, chunk_cache: Cache { .. } }, virtual_resolver: VirtualChunkResolver { containers: [VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }, VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }, VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }], credentials: {}, fetchers: RwLock { data: {} } }, branch_name: Some("main"), snapshot_id: 33add7a6dd4cacef21e4ba67, change_set: ChangeSet { new_groups: {}, new_arrays: {}, updated_arrays: {}, updated_attributes: {}, set_chunks: {}, deleted_groups: {}, deleted_arrays: {} } } } cc a4bf17e17086f3e723abccc1307f0b489a5e646e899c08e3b483d3befe15cb26 # shrinks to input = _TestAddArrayGroupClashArgs { path: Path(Utf8PathBuf { _encoding: "unix", inner: "/" }), metadata: ZarrArrayMetadata { shape: [1], data_type: Bool, chunk_shape: ChunkShape([1]), chunk_key_encoding: Slash, fill_value: Bool(false), codecs: [Codec { name: "mycodec", configuration: None }], storage_transformers: Some([StorageTransformer { name: "mytransformer", configuration: None }]), dimension_names: None }, session: Session { config: RepositoryConfig { inline_chunk_threshold_bytes: 512, unsafe_overwrite_refs: false, get_partial_values_concurrency: 10, compression: CompressionConfig { algorithm: Zstd, level: 1 }, caching: CachingConfig { snapshots_cache_size: 2, manifests_cache_size: 2, transactions_cache_size: 0, attributes_cache_size: 2, chunks_cache_size: 0 }, storage: None, virtual_chunk_containers: {"tigris": VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, "file": VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, "az": VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }, "s3": VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }, "gcs": VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }} }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n algorithm: Zstd\n level: 1\ncaching:\n snapshots_cache_size: 2\n manifests_cache_size: 2\n transactions_cache_size: 0\n attributes_cache_size: 2\n chunks_cache_size: 0\nstorage: null\nvirtual_chunk_containers:\n tigris:\n name: tigris\n url_prefix: tigris\n store: !Tigris {}\n file:\n name: file\n url_prefix: file\n store: !LocalFileSystem ''\n az:\n name: az\n url_prefix: az\n store: !Azure {}\n s3:\n name: s3\n url_prefix: s3\n store: !S3\n region: null\n endpoint_url: null\n anonymous: false\n allow_http: false\n gcs:\n name: gcs\n url_prefix: gcs\n store: !Gcs {}\n", last_modified: 2025-01-08T15:41:23.635133963Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"2T8MSGGT2FVZBS6094K0\"}", last_modified: 2025-01-08T15:41:23.635089347Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/2T8MSGGT2FVZBS6094K0" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic- \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb47BZ1BYSVSA8MSS4WB5JG\xbe2025-01-08T15:41:23.634873394Z\xb6Repository initialized5576Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.635043198Z, attributes: Attributes({Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, asset_resolver: AssetResolver { storage: ObjectStorage { config: ObjectStorageConfig { url: "memory:/", prefix: "", options: [] }, store: InMemory { storage: RwLock { data: Storage { next_etag: 3, map: {Path { raw: "config.yaml" }: Entry { data: b"inline_chunk_threshold_bytes: 512\nunsafe_overwrite_refs: false\nget_partial_values_concurrency: 10\ncompression:\n algorithm: Zstd\n level: 1\ncaching:\n snapshots_cache_size: 2\n manifests_cache_size: 2\n transactions_cache_size: 0\n attributes_cache_size: 2\n chunks_cache_size: 0\nstorage: null\nvirtual_chunk_containers:\n tigris:\n name: tigris\n url_prefix: tigris\n store: !Tigris {}\n file:\n name: file\n url_prefix: file\n store: !LocalFileSystem ''\n az:\n name: az\n url_prefix: az\n store: !Azure {}\n s3:\n name: s3\n url_prefix: s3\n store: !S3\n region: null\n endpoint_url: null\n anonymous: false\n allow_http: false\n gcs:\n name: gcs\n url_prefix: gcs\n store: !Gcs {}\n", last_modified: 2025-01-08T15:41:23.635133963Z, attributes: Attributes({ContentType: AttributeValue("application/yaml")}), e_tag: 2 }, Path { raw: "refs/branch.main/ZZZZZZZZ.json" }: Entry { data: b"{\"snapshot\":\"2T8MSGGT2FVZBS6094K0\"}", last_modified: 2025-01-08T15:41:23.635089347Z, attributes: Attributes({}), e_tag: 1 }, Path { raw: "snapshots/2T8MSGGT2FVZBS6094K0" }: Entry { data: b"ICE\xf0\x9f\xa7\x8aCHUNKic- \x01\x02\x01(\xb5/\xfd\0H\x1d\x03\0\xb4\x05\x9b\0\x80\x90\x90\0\0\x90\x93\xb47BZ1BYSVSA8MSS4WB5JG\xbe2025-01-08T15:41:23.634873394Z\xb6Repository initialized5576Z\x80\x80\x01\0T\xb6U\x14", last_modified: 2025-01-08T15:41:23.635043198Z, attributes: Attributes({Metadata("ic-comp-alg"): AttributeValue("zstd"), Metadata("ic-file-type"): AttributeValue("manifest"), Metadata("ic-spec-ver"): AttributeValue("1"), Metadata("ic-"): AttributeValue("ic-client")}), e_tag: 0 }}, uploads: {} } } } }, storage_settings: Settings { concurrency: ConcurrencySettings { max_concurrent_requests_for_object: 5, min_concurrent_request_size: 1 } }, num_snapshots: 2, num_manifests: 2, num_transactions: 0, num_attributes: 2, num_chunks: 0, snapshot_cache: Cache { .. }, manifest_cache: Cache { .. }, transactions_cache: Cache { .. }, attributes_cache: Cache { .. }, chunk_cache: Cache { .. } }, virtual_resolver: VirtualChunkResolver { containers: [VirtualChunkContainer { name: "tigris", url_prefix: "tigris", store: Tigris }, VirtualChunkContainer { name: "file", url_prefix: "file", store: LocalFileSystem("") }, VirtualChunkContainer { name: "gcs", url_prefix: "gcs", store: Gcs({}) }, VirtualChunkContainer { name: "az", url_prefix: "az", store: Azure }, VirtualChunkContainer { name: "s3", url_prefix: "s3", store: S3(S3Options { region: None, endpoint_url: None, anonymous: false, allow_http: false }) }], credentials: {}, fetchers: RwLock { data: {} } }, branch_name: Some("main"), snapshot_id: 16914cc21a13f7f5e4c04926, change_set: ChangeSet { new_groups: {}, new_arrays: {}, updated_arrays: {}, updated_attributes: {}, set_chunks: {}, deleted_groups: {}, deleted_arrays: {} } } } +cc 60be8007bfbb770a1a404577dde928290d9018a6c7ca8240e9f3e73922be174c # shrinks to input = _TestPropertyExtentsSetOpsArgs { e1: ManifestExtents([0..0, 0..0, 0..0, 0..0]), e2: ManifestExtents([0..0, 0..0, 0..0, 0..0]) } +cc 2ae68e9977c83f4cb719f54d9b37e9e54a660c4b5ec826e1ac34696d4ab0bf4e # shrinks to input = _TestPropertyExtentsSetOpsSameArgs { e: ManifestExtents([0..0, 0..0, 0..0, 0..0]) } diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 87c0e124e..302d1d7fb 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -56,7 +56,7 @@ impl ChangeSet { self.deleted_arrays.contains(path_and_id) } - pub fn chunk_changes( + pub fn changed_chunks( &self, ) -> impl Iterator)> { self.set_chunks.iter().map(|(node_id, split_map)| { diff --git a/icechunk/src/conflicts/detector.rs b/icechunk/src/conflicts/detector.rs index d2c4cef24..fc3ab7214 100644 --- a/icechunk/src/conflicts/detector.rs +++ b/icechunk/src/conflicts/detector.rs @@ -138,7 +138,7 @@ impl ConflictSolver for ConflictDetector { }); let chunks_double_updated = - current_changes.chunk_changes().filter_map(|(node_id, changes)| { + current_changes.changed_chunks().filter_map(|(node_id, changes)| { let previous_changes: HashSet<_> = previous_change.updated_chunks_for(node_id).collect(); if previous_changes.is_empty() { diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index f9e89009d..1b707c592 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -2,6 +2,7 @@ use std::{ borrow::Cow, cmp::{max, min}, convert::Infallible, + iter::zip, ops::Range, sync::Arc, }; @@ -59,16 +60,16 @@ impl ManifestExtents { pub fn intersection(&self, other: &Self) -> Option { debug_assert_eq!(self.len(), other.len()); - let ranges = std::iter::zip(self.iter(), other.iter()) + let ranges = zip(self.iter(), other.iter()) .map(|(a, b)| max(a.start, b.start)..min(a.end, b.end)) .collect::>(); - if any(ranges.iter(), |r| r.end < r.start) { None } else { Some(Self(ranges)) } + if any(ranges.iter(), |r| r.end <= r.start) { None } else { Some(Self(ranges)) } } pub fn union(&self, other: &Self) -> Self { debug_assert_eq!(self.len(), other.len()); Self::from_ranges_iter( - std::iter::zip(self.iter(), other.iter()) + zip(self.iter(), other.iter()) .map(|(a, b)| min(a.start, b.start)..max(a.end, b.end)), ) } diff --git a/icechunk/src/format/transaction_log.rs b/icechunk/src/format/transaction_log.rs index 06324b833..2ab20d3d1 100644 --- a/icechunk/src/format/transaction_log.rs +++ b/icechunk/src/format/transaction_log.rs @@ -42,7 +42,7 @@ impl TransactionLog { // these come sorted from the change set let updated_chunks = cs - .chunk_changes() + .changed_chunks() .map(|(node_id, chunks)| { let node_id = generated::ObjectId8::new(&node_id.0); let node_id = Some(&node_id); diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index fe27f1e41..38404267b 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -214,27 +214,17 @@ pub enum Overlap { pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap { debug_assert!(us.len() == them.len()); - let mut overlaps = vec![]; + let mut overlap = Overlap::Complete; for (a, b) in zip(us.iter(), them.iter()) { debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone()); debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone()); - - if (a.start <= b.start) && (a.end >= b.end) { - overlaps.push(Overlap::Complete); - } else if (a.end <= b.start) || (a.start >= b.end) { - overlaps.push(Overlap::None); - } else { - overlaps.push(Overlap::Partial) + if (a.end <= b.start) || (a.start >= b.end) { + return Overlap::None; + } else if !((a.start <= b.start) && (a.end >= b.end)) { + overlap = Overlap::Partial } } - - if overlaps.iter().all(|x| x == &Overlap::Complete) { - Overlap::Complete - } else if overlaps.iter().any(|x| x == &Overlap::None) { - Overlap::None - } else { - Overlap::Partial - } + overlap } #[derive(Clone, Debug, Serialize, Deserialize)] @@ -2476,6 +2466,40 @@ mod tests { assert_eq!(overlaps(&e1, &e2), Overlap::None); assert_eq!(overlaps(&e2, &e1), Overlap::None); + // asymmetric case + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let e2 = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let union = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let intersection = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + assert_eq!(overlaps(&e1, &e2), Overlap::Complete); + assert_eq!(overlaps(&e2, &e1), Overlap::Partial); + assert_eq!(e1.union(&e2), union.clone()); + assert_eq!(e2.union(&e1), union.clone()); + assert_eq!(e1.intersection(&e2), Some(intersection)); + + // empty set + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let e2 = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![2u32, 4, 6].as_slice(), + ); + assert_eq!(e1.intersection(&e2), None); + // this should create non-overlapping extents let splits = ManifestSplits::from_edges(vec![ vec![0, 10, 20], diff --git a/icechunk/src/strategies.rs b/icechunk/src/strategies.rs index 350a7f57f..d458a702c 100644 --- a/icechunk/src/strategies.rs +++ b/icechunk/src/strategies.rs @@ -101,7 +101,7 @@ pub fn shapes_and_dims(max_ndim: Option) -> impl Strategy impl Strategy { - (vec(0u32..1000u32, ndim), vec(0u32..1000u32, ndim)).prop_map(|(start, delta)| { + (vec(0u32..1000u32, ndim), vec(1u32..1000u32, ndim)).prop_map(|(start, delta)| { let stop = std::iter::zip(start.iter(), delta.iter()) .map(|(s, d)| s + d) .collect::>(); From c93ea844d280df17e2c5d619d3cd88ce857051bb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 Jun 2025 14:23:20 -0600 Subject: [PATCH 10/43] Fix distributed writes. --- icechunk-python/src/session.rs | 6 +++--- icechunk/src/change_set.rs | 25 +++++++++++++++++-------- icechunk/src/session.rs | 8 +++++--- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/icechunk-python/src/session.rs b/icechunk-python/src/session.rs index cf5306dcc..e5606c76a 100644 --- a/icechunk-python/src/session.rs +++ b/icechunk-python/src/session.rs @@ -165,14 +165,14 @@ impl PySession { pub fn merge(&self, other: &PySession, py: Python<'_>) -> PyResult<()> { // This is blocking function, we need to release the Gil py.allow_threads(move || { - // TODO: Bad clone - let changes = other.0.blocking_read().deref().changes().clone(); + // TODO: bad clone + let other = other.0.blocking_read().deref().clone(); pyo3_async_runtimes::tokio::get_runtime().block_on(async move { self.0 .write() .await - .merge(changes) + .merge(other) .await .map_err(PyIcechunkStoreError::SessionError)?; Ok(()) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 302d1d7fb..71baaa6f8 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -1,5 +1,5 @@ use std::{ - collections::{BTreeMap, HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet, btree_map, hash_map}, iter, }; @@ -326,14 +326,23 @@ impl ChangeSet { self.deleted_groups.extend(other.deleted_groups); self.deleted_arrays.extend(other.deleted_arrays); - for (node, other_chunks) in other.set_chunks.into_iter() { - match self.set_chunks.remove(&node) { - Some(mut old_value) => { - old_value.extend(other_chunks); - self.set_chunks.insert(node, old_value); + for (node, other_splits) in other.set_chunks.into_iter() { + // this complicated matching code avoids cloning `other.set_chunks`, which could be quite large + match self.set_chunks.entry(node) { + btree_map::Entry::Occupied(mut entry) => { + for (extent, their_split) in other_splits.into_iter() { + match entry.get_mut().entry(extent) { + hash_map::Entry::Occupied(mut our_split) => { + our_split.get_mut().extend(their_split); + } + hash_map::Entry::Vacant(entry) => { + entry.insert(their_split); + } + } + } } - None => { - self.set_chunks.insert(node, other_chunks); + btree_map::Entry::Vacant(entry) => { + entry.insert(other_splits); } } } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 38404267b..25c2785b1 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -921,12 +921,14 @@ impl Session { } /// Merge a set of `ChangeSet`s into the repository without committing them - #[instrument(skip(self, changes))] - pub async fn merge(&mut self, changes: ChangeSet) -> SessionResult<()> { + #[instrument(skip(self, other))] + pub async fn merge(&mut self, other: Session) -> SessionResult<()> { if self.read_only() { return Err(SessionErrorKind::ReadOnlySession.into()); } - self.change_set.merge(changes); + let Session { splits, change_set, .. } = other; + self.splits.extend(splits); + self.change_set.merge(change_set); Ok(()) } From 46638dee9ddc716036591b63601eeb2317acec3c Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 Jun 2025 18:55:45 -0600 Subject: [PATCH 11/43] Handle appends --- icechunk/src/change_set.rs | 32 +++++++++++++++++++++++++++++++- icechunk/src/session.rs | 10 ++++++---- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 71baaa6f8..d9130839f 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -103,7 +103,13 @@ impl ChangeSet { self.new_arrays.insert(path, (node_id, array_data)); } - pub fn update_array(&mut self, node_id: &NodeId, path: &Path, array_data: ArrayData) { + pub fn update_array( + &mut self, + node_id: &NodeId, + path: &Path, + array_data: ArrayData, + new_splits: &ManifestSplits, + ) { match self.new_arrays.get(path) { Some((id, _)) => { debug_assert!(!self.updated_arrays.contains_key(id)); @@ -113,6 +119,30 @@ impl ChangeSet { self.updated_arrays.insert(node_id.clone(), array_data); } } + + // update existing splits + if let Some(manifests) = self.set_chunks.remove(node_id) { + let mut new_manifests = + HashMap::::with_capacity( + new_splits.len(), + ); + for (old_extents, chunks) in manifests.into_iter() { + // FIXME: these extents had better be compatible + // test_cases: increasing size of (multiple) dimensions + // decreasing size of (multiple) dimensions + // + new_splits + .iter() + .find(|x| { + // case of increased dimension size + old_extents.overlap_with(*x) == Overlap::Complete + // case of decreased dimension size + || (*x).overlap_with(&old_extents) == Overlap::Complete + }) + .map(|extents| new_manifests.insert(extents.clone(), chunks)); + } + self.set_chunks.insert(node_id.clone(), new_manifests); + } } pub fn update_group(&mut self, node_id: &NodeId, path: &Path, definition: Bytes) { diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 25c2785b1..914330ce1 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -455,14 +455,13 @@ impl Session { user_data: Bytes, ) -> SessionResult<()> { self.get_array(path).await.map(|node| { - // Q: What happens if we set a chunk, then change a dimension name, so - // that the split changes. - // A: We ignore it. splits are set once for a node in a session, and are never changed. - // self.cache_splits(&node.id, path, &shape, &dimension_names); + // needed to handle a resize for example. + self.cache_splits(&node.id, path, &shape, &dimension_names); self.change_set.update_array( &node.id, path, ArrayData { shape, dimension_names, user_data }, + self.splits.get(&node.id).expect("getting splits should not fail."), ) }) } @@ -530,6 +529,9 @@ impl Session { shape: &ArrayShape, dimension_names: &Option>, ) { + // FIXME: handle conflicts here + // Q: What happens if we set a chunk, then change a dimension name, so + // that the split changes. let splitting = self.config.manifest().splitting(); let splits = splitting.get_split_sizes(path, shape, dimension_names); self.splits.insert(node_id.clone(), splits); From ac1d24b41ccfb1248cae1af05a56a9955ca82e71 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 Jun 2025 18:56:48 -0600 Subject: [PATCH 12/43] Refactor overlaps to overlap_with --- icechunk/src/change_set.rs | 2 +- icechunk/src/format/manifest.rs | 24 ++++++++++++ icechunk/src/session.rs | 69 +++++++++++---------------------- 3 files changed, 48 insertions(+), 47 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index d9130839f..e9df5209f 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize}; use crate::{ format::{ ChunkIndices, NodeId, Path, - manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits}, + manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits, Overlap}, snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot}, }, session::{SessionResult, which_extent_and_index}, diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index 1b707c592..96a4027ae 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -25,6 +25,13 @@ use super::{ ChunkId, ChunkIndices, ChunkLength, ChunkOffset, IcechunkResult, ManifestId, NodeId, }; +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Overlap { + Complete, + Partial, + None, +} + #[derive(Debug, Clone, Hash, PartialEq, Eq, Serialize, Deserialize)] pub struct ManifestExtents(Vec>); @@ -73,6 +80,23 @@ impl ManifestExtents { .map(|(a, b)| min(a.start, b.start)..max(a.end, b.end)), ) } + + pub fn overlap_with(&self, other: &Self) -> Overlap { + // Important: this is not symmetric. + debug_assert!(self.len() == other.len()); + + let mut overlap = Overlap::Complete; + for (a, b) in zip(other.iter(), self.iter()) { + debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone()); + debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone()); + if (a.end <= b.start) || (a.start >= b.end) { + return Overlap::None; + } else if !((a.start <= b.start) && (a.end >= b.end)) { + overlap = Overlap::Partial + } + } + overlap + } } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 914330ce1..0700e292f 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -3,7 +3,6 @@ use std::{ collections::{HashMap, HashSet}, convert::Infallible, future::{Future, ready}, - iter::zip, ops::Range, pin::Pin, sync::Arc, @@ -33,8 +32,9 @@ use crate::{ IcechunkFormatErrorKind, ManifestId, NodeId, ObjectId, Path, SnapshotId, manifest::{ ChunkInfo, ChunkPayload, ChunkRef, Manifest, ManifestExtents, ManifestRef, - ManifestSplits, VirtualChunkLocation, VirtualChunkRef, VirtualReferenceError, - VirtualReferenceErrorKind, uniform_manifest_split_edges, + ManifestSplits, Overlap, VirtualChunkLocation, VirtualChunkRef, + VirtualReferenceError, VirtualReferenceErrorKind, + uniform_manifest_split_edges, }, snapshot::{ ArrayShape, DimensionName, ManifestFileInfo, NodeData, NodeSnapshot, @@ -203,30 +203,6 @@ impl ManifestSplits { } } -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Overlap { - Complete, - Partial, - None, -} - -/// Important: this is not symmetric. -pub fn overlaps(us: &ManifestExtents, them: &ManifestExtents) -> Overlap { - debug_assert!(us.len() == them.len()); - - let mut overlap = Overlap::Complete; - for (a, b) in zip(us.iter(), them.iter()) { - debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone()); - debug_assert!(b.start <= b.end, "Invalid range: {:?}", b.clone()); - if (a.end <= b.start) || (a.start >= b.end) { - return Overlap::None; - } else if !((a.start <= b.start) && (a.end >= b.end)) { - overlap = Overlap::Partial - } - } - overlap -} - #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Session { config: RepositoryConfig, @@ -1298,7 +1274,7 @@ async fn verified_node_chunk_iterator<'a>( futures::stream::iter(manifests) .filter(move |manifest_ref| { futures::future::ready(extent.as_ref().is_none_or(|e| { - overlaps(&manifest_ref.extents, e) != Overlap::None + e.overlap_with(&manifest_ref.extents) != Overlap::None })) }) .then(move |manifest_ref| { @@ -1734,7 +1710,7 @@ impl<'a> FlushProcess<'a> { for old_ref in manifests.iter() { // Remember that the extents written to disk are the `from`:`to` ranges // of populated chunks - match overlaps(&old_ref.extents, extent) { + match extent.overlap_with(&old_ref.extents) { Overlap::Complete => { debug_assert!(on_disk_bbox.is_some()); // Just propagate this ref again, no rewriting necessary @@ -2343,7 +2319,7 @@ mod tests { ) { prop_assert_eq!(e.intersection(&e), Some(e.clone())); prop_assert_eq!(e.union(&e), e.clone()); - prop_assert_eq!(overlaps(&e, &e), Overlap::Complete); + prop_assert_eq!(e.overlap_with(&e), Overlap::Complete); } #[proptest] @@ -2360,17 +2336,17 @@ mod tests { prop_assert_eq!(union.intersection(&e2), Some(e2.clone())); // order is important for the next 2 - prop_assert_eq!(overlaps(&union, &e1), Overlap::Complete); - prop_assert_eq!(overlaps(&union, &e2), Overlap::Complete); + prop_assert_eq!(e1.overlap_with(&union), Overlap::Complete); + prop_assert_eq!(e2.overlap_with(&union), Overlap::Complete); if intersection.is_some() { let int = intersection.unwrap(); let expected = if e1 == e1 { Overlap::Complete } else { Overlap::Partial }; - prop_assert_eq!(overlaps(&e1, &int), expected.clone()); - prop_assert_eq!(overlaps(&e2, &int), expected); + prop_assert_eq!(int.overlap_with(&e1), expected.clone()); + prop_assert_eq!(int.overlap_with(&e2), expected); } else { - prop_assert_eq!(overlaps(&e1, &e2), Overlap::None); - prop_assert_eq!(overlaps(&e2, &e1), Overlap::None); + prop_assert_eq!(e2.overlap_with(&e1), Overlap::None); + prop_assert_eq!(e1.overlap_with(&e2), Overlap::None); } } @@ -2393,7 +2369,7 @@ mod tests { if all(delta_left.iter(), |elem| elem == &0i32) && all(delta_right.iter(), |elem| elem == &0i32) { - prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Complete); + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Complete); } let extent2 = ManifestExtents::from_ranges_iter( @@ -2410,7 +2386,7 @@ mod tests { }), ); - prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None); + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None); let extent2 = ManifestExtents::from_ranges_iter( multizip(( @@ -2425,7 +2401,7 @@ mod tests { ..((extent.end as i32 - width - low) as u32) }), ); - prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::None); + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None); let extent2 = ManifestExtents::from_ranges_iter( multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map( @@ -2435,7 +2411,7 @@ mod tests { }, ), ); - prop_assert_eq!(overlaps(&extent1, &extent2), Overlap::Partial); + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Partial); } #[icechunk_macros::test] @@ -2455,7 +2431,7 @@ mod tests { vec![12u32, 4, 6].as_slice(), ); - assert_eq!(overlaps(&e1, &e2), Overlap::None); + assert_eq!(e2.overlap_with(&e1), Overlap::None); assert_eq!(e1.intersection(&e2), None); assert_eq!(e1.union(&e2), union); @@ -2467,8 +2443,8 @@ mod tests { vec![2u32, 1, 2].as_slice(), vec![42u32, 4, 6].as_slice(), ); - assert_eq!(overlaps(&e1, &e2), Overlap::None); - assert_eq!(overlaps(&e2, &e1), Overlap::None); + assert_eq!(e2.overlap_with(&e1), Overlap::None); + assert_eq!(e1.overlap_with(&e2), Overlap::None); // asymmetric case let e1 = ManifestExtents::new( @@ -2487,8 +2463,8 @@ mod tests { vec![2u32, 1, 2].as_slice(), vec![3u32, 4, 6].as_slice(), ); - assert_eq!(overlaps(&e1, &e2), Overlap::Complete); - assert_eq!(overlaps(&e2, &e1), Overlap::Partial); + assert_eq!(e2.overlap_with(&e1), Overlap::Complete); + assert_eq!(e1.overlap_with(&e2), Overlap::Partial); assert_eq!(e1.union(&e2), union.clone()); assert_eq!(e2.union(&e1), union.clone()); assert_eq!(e1.intersection(&e2), Some(intersection)); @@ -2511,7 +2487,8 @@ mod tests { vec![0, 21, 22], ]); for vec in splits.iter().combinations(2) { - assert_eq!(overlaps(vec[0], vec[1]), Overlap::None) + assert_eq!(vec[0].overlap_with(vec[1]), Overlap::None); + assert_eq!(vec[1].overlap_with(vec[0]), Overlap::None); } Ok(()) From dae0d6005548ea51acfc6eab6ca6168cbc1b4593 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 9 Jun 2025 18:56:48 -0600 Subject: [PATCH 13/43] Refactor overlaps to overlap_with --- icechunk/src/change_set.rs | 4 ++-- icechunk/src/session.rs | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index e9df5209f..981c77eac 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -135,9 +135,9 @@ impl ChangeSet { .iter() .find(|x| { // case of increased dimension size - old_extents.overlap_with(*x) == Overlap::Complete + old_extents.overlap_with(x) == Overlap::Complete // case of decreased dimension size - || (*x).overlap_with(&old_extents) == Overlap::Complete + || x.overlap_with(&old_extents) == Overlap::Complete }) .map(|extents| new_manifests.insert(extents.clone(), chunks)); } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 0700e292f..b444e249a 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -437,6 +437,7 @@ impl Session { &node.id, path, ArrayData { shape, dimension_names, user_data }, + #[allow(clippy::expect_used)] self.splits.get(&node.id).expect("getting splits should not fail."), ) }) From f841e3c026409eb5603e19e575ad163ceab08974 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 10 Jun 2025 10:32:24 -0600 Subject: [PATCH 14/43] Cleanup --- icechunk/src/change_set.rs | 44 ++++++++-------------- icechunk/src/session.rs | 77 +++++++++++++++++--------------------- 2 files changed, 51 insertions(+), 70 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 981c77eac..9128b163d 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -13,7 +13,7 @@ use crate::{ manifest::{ChunkInfo, ChunkPayload, ManifestExtents, ManifestSplits, Overlap}, snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot}, }, - session::{SessionResult, which_extent_and_index}, + session::{SessionResult, find_coord}, }; #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] @@ -198,7 +198,7 @@ impl ChangeSet { splits: &ManifestSplits, ) { #[allow(clippy::expect_used)] - let (_, extent) = splits.which_extent_and_index(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); + let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); // this implementation makes delete idempotent // it allows deleting a deleted chunk by repeatedly setting None. self.set_chunks @@ -225,13 +225,10 @@ impl ChangeSet { node_id: &NodeId, coords: &ChunkIndices, ) -> Option<&Option> { - if let Some(node_chunks) = self.set_chunks.get(node_id) { - which_extent_and_index(node_chunks.keys(), coords).ok().and_then( - |(_, extent)| node_chunks.get(&extent).and_then(|s| s.get(coords)), - ) - } else { - None - } + self.set_chunks.get(node_id).and_then(|node_chunks| { + find_coord(node_chunks.keys(), coords) + .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords))) + }) } /// Drop the updated chunk references for the node. @@ -330,24 +327,13 @@ impl ChangeSet { self.new_arrays.iter().map(|(path, (node_id, _))| (path, node_id)) } - // pub fn take_chunks( - // &mut self, - // ) -> BTreeMap>> { - // take(&mut self.set_chunks) - // } - - // pub fn set_chunks( - // &mut self, - // chunks: BTreeMap>>, - // ) { - // self.set_chunks = chunks - // } - /// Merge this ChangeSet with `other`. /// /// Results of the merge are applied to `self`. Changes present in `other` take precedence over /// `self` changes. pub fn merge(&mut self, other: ChangeSet) { + // FIXME: this should detect conflict, for example, if different writers added on the same + // path, different objects, or if the same path is added and deleted, etc. // TODO: optimize self.new_groups.extend(other.new_groups); self.new_arrays.extend(other.new_arrays); @@ -539,13 +525,13 @@ mod tests { ); assert_eq!(None, change_set.new_arrays_chunk_iterator().next()); - let splits = ManifestSplits::from_edges(vec![vec![0, 10], vec![0, 10]]); + let splits1 = ManifestSplits::from_edges(vec![vec![0, 10], vec![0, 10]]); change_set.set_chunk_ref( node_id1.clone(), ChunkIndices(vec![0, 1]), None, - &splits, + &splits1, ); assert_eq!(None, change_set.new_arrays_chunk_iterator().next()); @@ -553,25 +539,27 @@ mod tests { node_id1.clone(), ChunkIndices(vec![1, 0]), Some(ChunkPayload::Inline("bar1".into())), - &splits, + &splits1, ); change_set.set_chunk_ref( node_id1.clone(), ChunkIndices(vec![1, 1]), Some(ChunkPayload::Inline("bar2".into())), - &splits, + &splits1, ); + + let splits2 = ManifestSplits::from_edges(vec![vec![0, 10]]); change_set.set_chunk_ref( node_id2.clone(), ChunkIndices(vec![0]), Some(ChunkPayload::Inline("baz1".into())), - &splits, + &splits2, ); change_set.set_chunk_ref( node_id2.clone(), ChunkIndices(vec![1]), Some(ChunkPayload::Inline("baz2".into())), - &splits, + &splits2, ); { diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index b444e249a..9af368e40 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -13,7 +13,7 @@ use bytes::Bytes; use chrono::{DateTime, Utc}; use err_into::ErrorInto; use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::Either, stream}; -use itertools::{Itertools as _, repeat_n}; +use itertools::{Itertools as _, enumerate, repeat_n}; use regex::bytes::Regex; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -164,10 +164,12 @@ pub type SessionResult = Result; // Returns the index of split_range that includes ChunkIndices // This can be used at write time to split manifests based on the config // and at read time to choose which manifest to query for chunk payload -pub fn which_extent_and_index<'a>( - iter: impl Iterator, - coord: &ChunkIndices, -) -> SessionResult<(usize, ManifestExtents)> { +/// It is useful to have this act on an iterator (e.g. get_chunk_ref) +/// The find method on ManifestSplits is simply a helper. +pub fn find_coord<'a, I>(mut iter: I, coord: &'a ChunkIndices) -> Option +where + I: Iterator, +{ // split_range[i] must bound ChunkIndices // 0 <= return value <= split_range.len() // it is possible that split_range does not include a coord. say we have 2x2 split grid @@ -176,30 +178,27 @@ pub fn which_extent_and_index<'a>( // Since split_range need not form a regular grid, we must iterate through and find the first result. // ManifestExtents in split_range MUST NOT overlap with each other. How do we ensure this? // ndim must be the same - // debug_assert_eq!(coord.0.len(), split_range[0].len()); - // FIXME: could optimize for unbounded single manifest // Note: I don't think we can distinguish between out of bounds index for the array // and an index that is part of a split that hasn't been written yet. - iter.enumerate() - .find(|(_, e)| e.contains(coord.0.as_slice())) - .map(|(i, e)| (i, e.clone())) - .ok_or( - SessionErrorKind::InvalidIndexForSplitManifests { coords: coord.clone() } - .into(), - ) + iter.find(|e| e.contains(coord.0.as_slice())).cloned() +} + +pub fn position_coord<'a, I>(iter: I, coord: &'a ChunkIndices) -> Option +where + I: Iterator, +{ + enumerate(iter).find(|(_, e)| e.contains(coord.0.as_slice())).map(|x| x.0) } impl ManifestSplits { - pub fn which_extent_and_index( - &self, - coord: &ChunkIndices, - ) -> SessionResult<(usize, ManifestExtents)> { - which_extent_and_index(self.iter(), coord) + pub fn find(&self, coord: &ChunkIndices) -> Option { + debug_assert_eq!(coord.0.len(), self.0[0].len()); + find_coord(self.iter(), coord) } - #[cfg(test)] - pub fn which_index(&self, coord: &ChunkIndices) -> SessionResult { - which_extent_and_index(self.iter(), coord).map(|(i, _)| i) + pub fn position(&self, coord: &ChunkIndices) -> Option { + debug_assert_eq!(coord.0.len(), self.0[0].len()); + position_coord(self.iter(), coord) } } @@ -803,16 +802,14 @@ impl Session { // in the changeset, return None to Zarr. return Ok(None); } - let splits = ManifestSplits::from_extents( - manifests.iter().map(|m| m.extents.clone()).collect(), - ); - let (index, _) = match splits.which_extent_and_index(coords) { - Ok(index) => index, + + let index = match position_coord(manifests.iter().map(|m| &m.extents), coords) { + Some(index) => index, // for an invalid coordinate, we bail. // This happens for two cases: // (1) the "coords" is out-of-range for the array shape // (2) the "coords" belongs to a shard that hasn't been written yet. - Err(_) => return Ok(None), + None => return Ok(None), }; let manifest = self.fetch_manifest(&manifests[index].object_id).await?; @@ -2530,16 +2527,16 @@ mod tests { async fn test_which_split() -> Result<(), Box> { let splits = ManifestSplits::from_edges(vec![vec![0, 10, 20]]); - assert_eq!(splits.which_index(&ChunkIndices(vec![1])).unwrap(), 0); - assert_eq!(splits.which_index(&ChunkIndices(vec![11])).unwrap(), 1); + assert_eq!(splits.position(&ChunkIndices(vec![1])), Some(0)); + assert_eq!(splits.position(&ChunkIndices(vec![11])), Some(1)); let edges = vec![vec![0, 10, 20], vec![0, 10, 20]]; let splits = ManifestSplits::from_edges(edges); - assert_eq!(splits.which_index(&ChunkIndices(vec![1, 1])).unwrap(), 0); - assert_eq!(splits.which_index(&ChunkIndices(vec![1, 10])).unwrap(), 1); - assert_eq!(splits.which_index(&ChunkIndices(vec![1, 11])).unwrap(), 1); - assert!(splits.which_index(&ChunkIndices(vec![21, 21])).is_err()); + assert_eq!(splits.position(&ChunkIndices(vec![1, 1])), Some(0)); + assert_eq!(splits.position(&ChunkIndices(vec![1, 10])), Some(1)); + assert_eq!(splits.position(&ChunkIndices(vec![1, 11])), Some(1)); + assert!(splits.position(&ChunkIndices(vec![21, 21])).is_none()); Ok(()) } @@ -2791,17 +2788,13 @@ mod tests { // set old array chunk and check them let data = Bytes::copy_from_slice(b"foo".repeat(512).as_slice()); let payload = ds.get_chunk_writer()(data.clone()).await?; - ds.set_chunk_ref(array1_path.clone(), ChunkIndices(vec![0, 0, 0]), Some(payload)) + ds.set_chunk_ref(array1_path.clone(), ChunkIndices(vec![0]), Some(payload)) .await?; let chunk = get_chunk( - ds.get_chunk_reader( - &array1_path, - &ChunkIndices(vec![0, 0, 0]), - &ByteRange::ALL, - ) - .await - .unwrap(), + ds.get_chunk_reader(&array1_path, &ChunkIndices(vec![0]), &ByteRange::ALL) + .await + .unwrap(), ) .await?; assert_eq!(chunk, Some(data)); From 839fdc341c0b14f6513caa4eafeec1fbb51aa4ee Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 10 Jun 2025 17:25:09 -0600 Subject: [PATCH 15/43] cleanup more --- icechunk/src/change_set.rs | 46 +++++++++++++------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 9128b163d..dbbe0b3ae 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -1,5 +1,5 @@ use std::{ - collections::{BTreeMap, HashMap, HashSet, btree_map, hash_map}, + collections::{BTreeMap, HashMap, HashSet}, iter, }; @@ -203,21 +203,15 @@ impl ChangeSet { // it allows deleting a deleted chunk by repeatedly setting None. self.set_chunks .entry(node_id.clone()) - .and_modify(|h| { - h.entry(extent.clone()).or_default().insert(coord.clone(), data.clone()); - }) .or_insert_with(|| { - let mut h = HashMap::< + HashMap::< ManifestExtents, BTreeMap>, - >::with_capacity(splits.len()); - h.entry(extent.clone()) - // TODO: this is duplicative. I can't use `or_default` because it's - // nice to create the HashMap using `with_capacity` - .or_default() - .insert(coord, data); - h - }); + >::with_capacity(splits.len()) + }) + .entry(extent.clone()) + .or_default() + .insert(coord.clone(), data.clone()); } pub fn get_chunk_ref( @@ -342,24 +336,14 @@ impl ChangeSet { self.deleted_groups.extend(other.deleted_groups); self.deleted_arrays.extend(other.deleted_arrays); - for (node, other_splits) in other.set_chunks.into_iter() { - // this complicated matching code avoids cloning `other.set_chunks`, which could be quite large - match self.set_chunks.entry(node) { - btree_map::Entry::Occupied(mut entry) => { - for (extent, their_split) in other_splits.into_iter() { - match entry.get_mut().entry(extent) { - hash_map::Entry::Occupied(mut our_split) => { - our_split.get_mut().extend(their_split); - } - hash_map::Entry::Vacant(entry) => { - entry.insert(their_split); - } - } - } - } - btree_map::Entry::Vacant(entry) => { - entry.insert(other_splits); - } + for (node, other_splits) in other.set_chunks { + let manifests = self.set_chunks.entry(node).or_insert_with(|| { + HashMap::::with_capacity( + other_splits.len(), + ) + }); + for (extent, their_manifest) in other_splits { + manifests.entry(extent).or_default().extend(their_manifest) } } } From 35934a97b63b7a5f47df34b96638e2c4d28ba071 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 10 Jun 2025 17:27:26 -0600 Subject: [PATCH 16/43] more tests --- .../tests/test_zarr/test_stateful.py | 23 ++++++-- icechunk/src/change_set.rs | 36 ++++++++----- icechunk/src/format/manifest.rs | 7 ++- icechunk/src/session.rs | 54 +++++++++++++++++-- 4 files changed, 97 insertions(+), 23 deletions(-) diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py index a3477848f..dc63d8422 100644 --- a/icechunk-python/tests/test_zarr/test_stateful.py +++ b/icechunk-python/tests/test_zarr/test_stateful.py @@ -14,8 +14,11 @@ ) import zarr +import logging from icechunk import Repository, in_memory_storage from zarr.core.buffer import default_buffer_prototype +import hypothesis.extra.numpy as npst +from zarr.storage import LoggingStore from zarr.testing.stateful import ZarrHierarchyStateMachine from zarr.testing.strategies import ( node_names, @@ -44,10 +47,10 @@ def chunk_paths( class ModifiedZarrHierarchyStateMachine(ZarrHierarchyStateMachine): def __init__(self, repo: Repository) -> None: self.repo = repo - store = repo.writable_session("main").store + store = LoggingStore(repo.writable_session("main").store, log_level=logging.INFO + 1) super().__init__(store) - @precondition(lambda self: self.store.session.has_uncommitted_changes) + @precondition(lambda self: self.store._store.session.has_uncommitted_changes) @rule(data=st.data()) def commit_with_check(self, data) -> None: note("committing and checking list_prefix") @@ -57,9 +60,9 @@ def commit_with_check(self, data) -> None: get_before = self._sync(self.store.get(path, prototype=PROTOTYPE)) assert get_before - self.store.session.commit("foo") + self.store._store.session.commit("foo") - self.store = self.repo.writable_session("main").store + self.store = LoggingStore(self.repo.writable_session("main").store) lsafter = sorted(self._sync_iter(self.store.list_prefix(""))) get_after = self._sync(self.store.get(path, prototype=PROTOTYPE)) @@ -182,6 +185,18 @@ def delete_chunk(self, data) -> None: self._sync(self.model.delete(path)) self._sync(self.store.delete(path)) + @precondition(lambda self: bool(self.all_arrays)) + @rule(data=st.data()) + def resize_array(self, data) -> None: + array = data.draw(st.sampled_from(sorted(self.all_arrays))) + model_array = zarr.open_array(path=array, store=self.model) + store_array = zarr.open_array(path=array, store=self.store) + ndim = model_array.ndim + new_shape = data.draw(npst.array_shapes(max_dims=ndim, min_dims=ndim, min_side=1)) + note(f"resizing array from {model_array.shape} to {new_shape}") + model_array.resize(new_shape) + store_array.resize(new_shape) + @precondition(lambda self: bool(self.all_arrays) or bool(self.all_groups)) @rule(data=st.data()) def delete_dir(self, data) -> None: diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index dbbe0b3ae..40ed9facb 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -126,20 +126,28 @@ impl ChangeSet { HashMap::::with_capacity( new_splits.len(), ); - for (old_extents, chunks) in manifests.into_iter() { - // FIXME: these extents had better be compatible - // test_cases: increasing size of (multiple) dimensions - // decreasing size of (multiple) dimensions - // - new_splits - .iter() - .find(|x| { - // case of increased dimension size - old_extents.overlap_with(x) == Overlap::Complete - // case of decreased dimension size - || x.overlap_with(&old_extents) == Overlap::Complete - }) - .map(|extents| new_manifests.insert(extents.clone(), chunks)); + for (old_extents, mut chunks) in manifests.into_iter() { + for new_extents in new_splits.iter() { + if old_extents.overlap_with(new_extents) == Overlap::None { + continue; + } + + // TODO: replace with `BTreeMap.drain_filter` after it is stable. + let mut extracted = + BTreeMap::>::new(); + chunks.retain(|coord, payload| { + if new_extents.contains(coord.0.as_slice()) { + extracted.insert(coord.clone(), payload.clone()); + false + } else { + true + } + }); + new_manifests + .entry(new_extents.clone()) + .or_default() + .extend(extracted); + } } self.set_chunks.insert(node_id.clone(), new_manifests); } diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index 96a4027ae..9e0ae5b55 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -83,7 +83,12 @@ impl ManifestExtents { pub fn overlap_with(&self, other: &Self) -> Overlap { // Important: this is not symmetric. - debug_assert!(self.len() == other.len()); + debug_assert!( + self.len() == other.len(), + "Length mismatch: self = {:?}, other = {:?}", + &self, + &other + ); let mut overlap = Overlap::Complete; for (a, b) in zip(other.iter(), self.iter()) { diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 9af368e40..f8a809953 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -2772,13 +2772,13 @@ mod tests { let array_def3 = Bytes::from_static(br#"{"this":"more arrays"}"#); ds.update_array( - &array1_path.clone(), + &new_array_path.clone(), shape3.clone(), dimension_names3.clone(), array_def3.clone(), ) .await?; - let node = ds.get_node(&array1_path).await; + let node = ds.get_node(&new_array_path).await; if let Ok(NodeSnapshot { node_data: NodeData::Array { shape, .. }, .. }) = &node { assert_eq!(shape, &shape3); } else { @@ -2788,11 +2788,57 @@ mod tests { // set old array chunk and check them let data = Bytes::copy_from_slice(b"foo".repeat(512).as_slice()); let payload = ds.get_chunk_writer()(data.clone()).await?; - ds.set_chunk_ref(array1_path.clone(), ChunkIndices(vec![0]), Some(payload)) + ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0]), Some(payload)) .await?; let chunk = get_chunk( - ds.get_chunk_reader(&array1_path, &ChunkIndices(vec![0]), &ByteRange::ALL) + ds.get_chunk_reader(&new_array_path, &ChunkIndices(vec![0]), &ByteRange::ALL) + .await + .unwrap(), + ) + .await?; + assert_eq!(chunk, Some(data)); + + // reduce size of dimension + // // update old array zarr metadata and check it + let shape4 = ArrayShape::new(vec![(6, 3)]).unwrap(); + let array_def3 = Bytes::from_static(br#"{"this":"more arrays"}"#); + ds.update_array( + &new_array_path.clone(), + shape4.clone(), + dimension_names3.clone(), + array_def3.clone(), + ) + .await?; + let node = ds.get_node(&new_array_path).await; + if let Ok(NodeSnapshot { node_data: NodeData::Array { shape, .. }, .. }) = &node { + assert_eq!(shape, &shape4); + } else { + panic!("Failed to update zarr metadata"); + } + + // set old array chunk and check them + let data = Bytes::copy_from_slice(b"old".repeat(512).as_slice()); + let payload = ds.get_chunk_writer()(data.clone()).await?; + ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![0]), Some(payload)) + .await?; + let data = Bytes::copy_from_slice(b"new".repeat(512).as_slice()); + let payload = ds.get_chunk_writer()(data.clone()).await?; + ds.set_chunk_ref(new_array_path.clone(), ChunkIndices(vec![1]), Some(payload)) + .await?; + + let chunk = get_chunk( + ds.get_chunk_reader(&new_array_path, &ChunkIndices(vec![1]), &ByteRange::ALL) + .await + .unwrap(), + ) + .await?; + assert_eq!(chunk, Some(data.clone())); + + ds.commit("commit", Some(SnapshotProperties::default())).await?; + + let chunk = get_chunk( + ds.get_chunk_reader(&new_array_path, &ChunkIndices(vec![1]), &ByteRange::ALL) .await .unwrap(), ) From 4b3cde397f5ba8e8383642069f1892b85dd8111d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 10 Jun 2025 19:26:46 -0600 Subject: [PATCH 17/43] more cleanup --- icechunk/src/change_set.rs | 13 ++++++------- icechunk/src/session.rs | 8 ++------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 40ed9facb..7337fcc77 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -136,12 +136,11 @@ impl ChangeSet { let mut extracted = BTreeMap::>::new(); chunks.retain(|coord, payload| { - if new_extents.contains(coord.0.as_slice()) { + let cond = new_extents.contains(coord.0.as_slice()); + if cond { extracted.insert(coord.clone(), payload.clone()); - false - } else { - true } + !cond }); new_manifests .entry(new_extents.clone()) @@ -295,17 +294,17 @@ impl ChangeSet { ) } - pub fn array_manifests_iterator( + pub fn modified_manifest_extents_iterator( &self, node_id: &NodeId, node_path: &Path, - ) -> impl Iterator + use<'_> { + ) -> impl Iterator + use<'_> { if self.is_deleted(node_path, node_id) { return Either::Left(iter::empty()); } match self.set_chunks.get(node_id) { None => Either::Left(iter::empty()), - Some(h) => Either::Right(h.iter()), + Some(h) => Either::Right(h.keys()), } } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index f8a809953..da2d56e23 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1620,10 +1620,7 @@ impl<'a> FlushProcess<'a> { refs: HashMap, ) -> SessionResult<()> { for ref_ in refs.into_values() { - self.manifest_refs - .entry(node_id.clone()) - .and_modify(|v| v.push(ref_.clone())) - .or_insert_with(|| vec![ref_]); + self.manifest_refs.entry(node_id.clone()).or_default().push(ref_); } Ok(()) } @@ -1682,8 +1679,7 @@ impl<'a> FlushProcess<'a> { let modified_splits = self .change_set - .array_manifests_iterator(&node.id, &node.path) - .map(|(extents, _)| extents) + .modified_manifest_extents_iterator(&node.id, &node.path) .collect::>(); // FIXME: there is an invariant here From b9d3ec825d7e10475f1d1a89e8b81dc9197ab397 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Jun 2025 12:57:55 -0600 Subject: [PATCH 18/43] note --- icechunk/src/change_set.rs | 1 + icechunk/src/session.rs | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 7337fcc77..edd8dc9ae 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -69,6 +69,7 @@ impl ChangeSet { } pub fn arrays_with_chunk_changes(&self) -> impl Iterator { + // FIXME: needs test for session with only chunk deletes on existing nodes self.set_chunks.keys() } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index da2d56e23..a5cbef38a 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1171,6 +1171,10 @@ async fn updated_chunk_iterator<'a>( let snapshot = asset_manager.fetch_snapshot(snapshot_id).await?; let nodes = futures::stream::iter(snapshot.iter_arc()); let res = nodes.and_then(move |node| async move { + // Note: Confusingly, these NodeSnapshot instances have the metadata stored in the snapshot. + // We have not applied any changeset updates. At the moment, the downstream code only + // use node.id so there is no need to update yet. + Ok(updated_node_chunks_iterator( asset_manager, change_set, From 75d264a8d250241cf4168a694ecffb87476cb1c2 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Jun 2025 12:58:54 -0600 Subject: [PATCH 19/43] Track _all_ deleted chunks separately --- icechunk/src/change_set.rs | 99 +++++++++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 23 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index edd8dc9ae..2fa233f64 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -31,6 +31,9 @@ pub struct ChangeSet { updated_arrays: HashMap, updated_groups: HashMap, set_chunks: BTreeMap>, + // we track deleted chunk indices separately to handle the case when resizing + // an array changes the splits, and we might lose record of chunk deletes. + deleted_chunks: HashMap>, deleted_groups: HashSet<(Path, NodeId)>, deleted_arrays: HashSet<(Path, NodeId)>, } @@ -60,12 +63,25 @@ impl ChangeSet { &self, ) -> impl Iterator)> { self.set_chunks.iter().map(|(node_id, split_map)| { - (node_id, split_map.values().flat_map(|x| x.keys())) + let deletes = self + .deleted_chunks + .get(node_id) + .map(|x| Either::Right(x.iter())) + .or_else(|| Some(Either::Left(iter::empty()))) + .unwrap(); + ( + node_id, + Either::, _>::Right( + split_map.values().flat_map(|x| x.keys()), + ) + .chain(deletes), + ) }) } pub fn has_chunk_changes(&self, node: &NodeId) -> bool { self.set_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false) + || self.deleted_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false) } pub fn arrays_with_chunk_changes(&self) -> impl Iterator { @@ -205,21 +221,43 @@ impl ChangeSet { data: Option, splits: &ManifestSplits, ) { - #[allow(clippy::expect_used)] - let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); - // this implementation makes delete idempotent - // it allows deleting a deleted chunk by repeatedly setting None. - self.set_chunks - .entry(node_id.clone()) - .or_insert_with(|| { - HashMap::< - ManifestExtents, - BTreeMap>, - >::with_capacity(splits.len()) - }) - .entry(extent.clone()) - .or_default() - .insert(coord.clone(), data.clone()); + // deletes must always be recorded, since the chunk might exist in the on-disk manifest + let deletes = self.deleted_chunks.entry(node_id.clone()).or_default(); + let extent = splits.find(&coord); + match data { + Some(payload) => { + deletes.remove(&coord); + self.set_chunks + .entry(node_id) + .or_insert_with(|| { + HashMap::::with_capacity( + splits.len(), + ) + }) + .entry(extent.expect("grabbing this extent should succeed")) + // at this point, we have valid data so we must always insert the ref + .or_default() + .insert(coord.clone(), Some(payload)); + } + None => { + // this implementation makes delete idempotent + // it allows deleting a deleted chunk by repeatedly setting None. + deletes.insert(coord.clone()); + if let Some(extent) = extent { + // both uses of `or_default` are a bit too clever. + // I am making sure this extent is present in the map, + // so that `modified_manifest_extents_iterator` picks it up + self.set_chunks + .entry(node_id) + .or_default() + .entry(extent) + .and_modify(|manifest| { + manifest.remove(&coord); + }) + .or_default(); + }; + } + } } pub fn get_chunk_ref( @@ -227,9 +265,16 @@ impl ChangeSet { node_id: &NodeId, coords: &ChunkIndices, ) -> Option<&Option> { - self.set_chunks.get(node_id).and_then(|node_chunks| { - find_coord(node_chunks.keys(), coords) - .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords))) + self.deleted_chunks.get(node_id).and_then(|deletes| { + if deletes.contains(coords) { + Some(&None) + } else { + self.set_chunks.get(node_id).and_then(|node_chunks| { + find_coord(node_chunks.keys(), coords).and_then(|extent| { + node_chunks.get(&extent).and_then(|s| s.get(coords)) + }) + }) + } }) } @@ -258,14 +303,22 @@ impl ChangeSet { } match self.set_chunks.get(node_id) { None => Either::Left(iter::empty()), - Some(h) => Either::Right( - h.iter() + Some(h) => { + let set_chunks = h + .iter() // FIXME: review this .filter(move |(manifest_extent, _)| { extent.is_none() || Some(*manifest_extent) == extent.as_ref() }) - .flat_map(|(_, manifest)| manifest.iter()), - ), + .flat_map(|(_, manifest)| manifest.iter()); + let deleted_chunks = match self.deleted_chunks.get(node_id) { + Some(deletes) => Either::Right( + deletes.iter().map(|coord| (coord, &None::)), + ), + None => Either::Left(iter::empty()), + }; + Either::Right(set_chunks.chain(deleted_chunks)) + } } } From f88c0f5021214b3280c1f1286863914fe0ba903d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Jun 2025 12:59:30 -0600 Subject: [PATCH 20/43] Revert "Track _all_ deleted chunks separately" This reverts commit a6b14e76b28d1919e05db9da52c9f69da5d4015c. --- icechunk/src/change_set.rs | 99 +++++++++----------------------------- 1 file changed, 23 insertions(+), 76 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 2fa233f64..edd8dc9ae 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -31,9 +31,6 @@ pub struct ChangeSet { updated_arrays: HashMap, updated_groups: HashMap, set_chunks: BTreeMap>, - // we track deleted chunk indices separately to handle the case when resizing - // an array changes the splits, and we might lose record of chunk deletes. - deleted_chunks: HashMap>, deleted_groups: HashSet<(Path, NodeId)>, deleted_arrays: HashSet<(Path, NodeId)>, } @@ -63,25 +60,12 @@ impl ChangeSet { &self, ) -> impl Iterator)> { self.set_chunks.iter().map(|(node_id, split_map)| { - let deletes = self - .deleted_chunks - .get(node_id) - .map(|x| Either::Right(x.iter())) - .or_else(|| Some(Either::Left(iter::empty()))) - .unwrap(); - ( - node_id, - Either::, _>::Right( - split_map.values().flat_map(|x| x.keys()), - ) - .chain(deletes), - ) + (node_id, split_map.values().flat_map(|x| x.keys())) }) } pub fn has_chunk_changes(&self, node: &NodeId) -> bool { self.set_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false) - || self.deleted_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false) } pub fn arrays_with_chunk_changes(&self) -> impl Iterator { @@ -221,43 +205,21 @@ impl ChangeSet { data: Option, splits: &ManifestSplits, ) { - // deletes must always be recorded, since the chunk might exist in the on-disk manifest - let deletes = self.deleted_chunks.entry(node_id.clone()).or_default(); - let extent = splits.find(&coord); - match data { - Some(payload) => { - deletes.remove(&coord); - self.set_chunks - .entry(node_id) - .or_insert_with(|| { - HashMap::::with_capacity( - splits.len(), - ) - }) - .entry(extent.expect("grabbing this extent should succeed")) - // at this point, we have valid data so we must always insert the ref - .or_default() - .insert(coord.clone(), Some(payload)); - } - None => { - // this implementation makes delete idempotent - // it allows deleting a deleted chunk by repeatedly setting None. - deletes.insert(coord.clone()); - if let Some(extent) = extent { - // both uses of `or_default` are a bit too clever. - // I am making sure this extent is present in the map, - // so that `modified_manifest_extents_iterator` picks it up - self.set_chunks - .entry(node_id) - .or_default() - .entry(extent) - .and_modify(|manifest| { - manifest.remove(&coord); - }) - .or_default(); - }; - } - } + #[allow(clippy::expect_used)] + let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); + // this implementation makes delete idempotent + // it allows deleting a deleted chunk by repeatedly setting None. + self.set_chunks + .entry(node_id.clone()) + .or_insert_with(|| { + HashMap::< + ManifestExtents, + BTreeMap>, + >::with_capacity(splits.len()) + }) + .entry(extent.clone()) + .or_default() + .insert(coord.clone(), data.clone()); } pub fn get_chunk_ref( @@ -265,16 +227,9 @@ impl ChangeSet { node_id: &NodeId, coords: &ChunkIndices, ) -> Option<&Option> { - self.deleted_chunks.get(node_id).and_then(|deletes| { - if deletes.contains(coords) { - Some(&None) - } else { - self.set_chunks.get(node_id).and_then(|node_chunks| { - find_coord(node_chunks.keys(), coords).and_then(|extent| { - node_chunks.get(&extent).and_then(|s| s.get(coords)) - }) - }) - } + self.set_chunks.get(node_id).and_then(|node_chunks| { + find_coord(node_chunks.keys(), coords) + .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords))) }) } @@ -303,22 +258,14 @@ impl ChangeSet { } match self.set_chunks.get(node_id) { None => Either::Left(iter::empty()), - Some(h) => { - let set_chunks = h - .iter() + Some(h) => Either::Right( + h.iter() // FIXME: review this .filter(move |(manifest_extent, _)| { extent.is_none() || Some(*manifest_extent) == extent.as_ref() }) - .flat_map(|(_, manifest)| manifest.iter()); - let deleted_chunks = match self.deleted_chunks.get(node_id) { - Some(deletes) => Either::Right( - deletes.iter().map(|coord| (coord, &None::)), - ), - None => Either::Left(iter::empty()), - }; - Either::Right(set_chunks.chain(deleted_chunks)) - } + .flat_map(|(_, manifest)| manifest.iter()), + ), } } From d6d1a9088162fdc27714ec66f830365b05816aeb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Jun 2025 14:15:49 -0600 Subject: [PATCH 21/43] Track deleted chunks _outside_ array shape only --- icechunk/src/change_set.rs | 40 +++++++++++ icechunk/src/session.rs | 132 ++++++++++++++++++++++++++++++++++++- 2 files changed, 169 insertions(+), 3 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index edd8dc9ae..fb4cff0d9 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -31,6 +31,10 @@ pub struct ChangeSet { updated_arrays: HashMap, updated_groups: HashMap, set_chunks: BTreeMap>, + // This map keeps track of any chunk deletes that are + // outside the domain of the current array shape. This is needed to handle + // the very unlikely case of multiple resizes in the same session. + deleted_chunks_outside_bounds: BTreeMap>, deleted_groups: HashSet<(Path, NodeId)>, deleted_arrays: HashSet<(Path, NodeId)>, } @@ -123,6 +127,7 @@ impl ChangeSet { // update existing splits if let Some(manifests) = self.set_chunks.remove(node_id) { + let mut new_deleted_chunks = HashSet::::new(); let mut new_manifests = HashMap::::with_capacity( new_splits.len(), @@ -148,8 +153,31 @@ impl ChangeSet { .or_default() .extend(extracted); } + new_deleted_chunks.extend( + chunks.into_iter().filter_map(|(coord, payload)| { + payload.is_none().then_some(coord) + }), + ); } + + // bring back any previously tracked deletes + if let Some(deletes) = self.deleted_chunks_outside_bounds.get(node_id) { + for coord in deletes.iter() { + if let Some(extents) = new_splits.find(coord) { + new_manifests + .entry(extents) + .or_default() + .insert(coord.clone(), None); + }; + } + }; self.set_chunks.insert(node_id.clone(), new_manifests); + + // keep track of any deletes not inserted in to set_chunks + self.deleted_chunks_outside_bounds + .entry(node_id.clone()) + .or_default() + .extend(new_deleted_chunks); } } @@ -247,6 +275,16 @@ impl ChangeSet { } } + pub fn deleted_chunks_iterator( + &self, + node_id: &NodeId, + ) -> impl Iterator { + match self.deleted_chunks_outside_bounds.get(node_id) { + Some(deletes) => Either::Right(deletes.iter()), + None => Either::Left(iter::empty()), + } + } + pub fn array_chunks_iterator( &self, node_id: &NodeId, @@ -343,6 +381,8 @@ impl ChangeSet { self.updated_arrays.extend(other.updated_arrays); self.deleted_groups.extend(other.deleted_groups); self.deleted_arrays.extend(other.deleted_arrays); + // FIXME: do we even test this? + self.deleted_chunks_outside_bounds.extend(other.deleted_chunks_outside_bounds); for (node, other_splits) in other.set_chunks { let manifests = self.set_chunks.entry(node).or_insert_with(|| { diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index a5cbef38a..c950e2a9e 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -637,9 +637,12 @@ impl Session { message: "getting chunk reference".to_string(), } .into()), - NodeData::Array { manifests, .. } => { - // Note: at this point, coords could be invalid for the array shape - // but we just let that pass. + NodeData::Array { shape, manifests, .. } => { + if !shape.valid_chunk_coord(coords) { + // this chunk ref cannot exist + return Ok(None); + } + // check the chunks modified in this session first // TODO: I hate rust forces me to clone to search in a hashmap. How to do better? let session_chunk = @@ -1254,6 +1257,9 @@ async fn verified_node_chunk_iterator<'a>( change_set .array_chunks_iterator(&node.id, &node.path, extent.clone()) .map(|(idx, _)| idx) + // by chaining here, we make sure we don't pull from the manifest + // any chunks that were deleted prior to resizing in this session + .chain(change_set.deleted_chunks_iterator(&node.id)) .collect(), ); @@ -2133,6 +2139,7 @@ mod tests { use crate::{ ObjectStorage, Repository, + config::{ManifestConfig, ManifestSplitCondition}, conflicts::{ basic_solver::{BasicConflictSolver, VersionSelection}, detector::ConflictDetector, @@ -2587,6 +2594,125 @@ mod tests { Ok(()) } + #[tokio_test] + async fn test_repository_with_splits_and_resizes() -> Result<(), Box> { + let storage: Arc = new_in_memory_storage().await?; + // let storage_settings = storage.default_settings(); + // let asset_manager = + // AssetManager::new_no_cache(Arc::clone(&storage), storage_settings.clone(), 1); + + let split_sizes = Some(vec![( + ManifestSplitCondition::PathMatches { regex: r".*".to_string() }, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::Any, + num_chunks: 2, + }], + )]); + + let man_config = ManifestConfig { + splitting: Some(ManifestSplittingConfig { split_sizes }), + ..ManifestConfig::default() + }; + + let repo = Repository::create( + Some(RepositoryConfig { + inline_chunk_threshold_bytes: Some(0), + manifest: Some(man_config), + ..Default::default() + }), + storage, + HashMap::new(), + ) + .await?; + let mut session = repo.writable_session("main").await?; + session.add_group(Path::root(), Bytes::copy_from_slice(b"")).await?; + + let array_path: Path = "/array".to_string().try_into().unwrap(); + let shape = ArrayShape::new(vec![(4, 1)]).unwrap(); + let dimension_names = Some(vec!["t".into()]); + let array_def = Bytes::from_static(br#"{"this":"other array"}"#); + + session + .add_array( + array_path.clone(), + shape.clone(), + dimension_names.clone(), + array_def.clone(), + ) + .await?; + + let bytes = Bytes::copy_from_slice(&42i8.to_be_bytes()); + for idx in vec![0, 2] { + let payload = session.get_chunk_writer()(bytes.clone()).await?; + session + .set_chunk_ref(array_path.clone(), ChunkIndices(vec![idx]), Some(payload)) + .await?; + } + session.commit("None", None).await?; + + let mut session = repo.writable_session("main").await?; + // This is how Zarr resizes + // first, delete any out of bounds chunks + session.set_chunk_ref(array_path.clone(), ChunkIndices(vec![2]), None).await?; + // second, update metadata + let shape2 = ArrayShape::new(vec![(2, 1)]).unwrap(); + session + .update_array( + &array_path, + shape2.clone(), + dimension_names.clone(), + array_def.clone(), + ) + .await?; + + assert!( + session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none() + ); + + // resize back to original shape + session + .update_array( + &array_path, + shape.clone(), + dimension_names.clone(), + array_def.clone(), + ) + .await?; + + // should still be deleted + assert!( + session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none() + ); + + // set another chunk in this split + let payload = session.get_chunk_writer()(bytes.clone()).await?; + session + .set_chunk_ref(array_path.clone(), ChunkIndices(vec![3]), Some(payload)) + .await?; + // should still be deleted + assert!( + session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none() + ); + // new ref should be present + assert!( + session.get_chunk_ref(&array_path, &ChunkIndices(vec![3])).await?.is_some() + ); + + // write manifests, check number of references in manifest + session.commit("updated", None).await?; + + // should still be deleted + assert!( + session.get_chunk_ref(&array_path, &ChunkIndices(vec![2])).await?.is_none() + ); + // new ref should be present + assert!( + session.get_chunk_ref(&array_path, &ChunkIndices(vec![3])).await?.is_some() + ); + + Ok(()) + } + #[tokio_test] async fn test_repository_with_updates() -> Result<(), Box> { let storage: Arc = new_in_memory_storage().await?; From 9b9712e321b8a4c8ab039cd684f5b40ad504ef6e Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Jun 2025 14:56:26 -0600 Subject: [PATCH 22/43] Update stateful tests --- .../tests/test_stateful_repo_ops.py | 2 +- .../tests/test_zarr/test_stateful.py | 23 +++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/icechunk-python/tests/test_stateful_repo_ops.py b/icechunk-python/tests/test_stateful_repo_ops.py index dfdc6a9f9..47dbd4809 100644 --- a/icechunk-python/tests/test_stateful_repo_ops.py +++ b/icechunk-python/tests/test_stateful_repo_ops.py @@ -570,7 +570,7 @@ def check_list_prefix_from_root(self) -> None: # need to load to dict to compare since ordering of entries might differ expected = json.loads(self.model[k].to_bytes()) value = self.sync_store.get(k, default_buffer_prototype()) - assert value is not None + assert value is not None, k actual = json.loads(value.to_bytes()) actual_fv = actual.pop("fill_value") expected_fv = expected.pop("fill_value") diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py index dc63d8422..daa6e8555 100644 --- a/icechunk-python/tests/test_zarr/test_stateful.py +++ b/icechunk-python/tests/test_zarr/test_stateful.py @@ -1,6 +1,7 @@ import json from typing import Any +import hypothesis.extra.numpy as npst import hypothesis.strategies as st import numpy as np import pytest @@ -14,13 +15,11 @@ ) import zarr -import logging from icechunk import Repository, in_memory_storage from zarr.core.buffer import default_buffer_prototype -import hypothesis.extra.numpy as npst -from zarr.storage import LoggingStore from zarr.testing.stateful import ZarrHierarchyStateMachine from zarr.testing.strategies import ( + basic_indices, node_names, np_array_and_chunks, numpy_arrays, @@ -47,7 +46,7 @@ def chunk_paths( class ModifiedZarrHierarchyStateMachine(ZarrHierarchyStateMachine): def __init__(self, repo: Repository) -> None: self.repo = repo - store = LoggingStore(repo.writable_session("main").store, log_level=logging.INFO + 1) + store = repo.writable_session("main").store super().__init__(store) @precondition(lambda self: self.store._store.session.has_uncommitted_changes) @@ -62,7 +61,7 @@ def commit_with_check(self, data) -> None: self.store._store.session.commit("foo") - self.store = LoggingStore(self.repo.writable_session("main").store) + self.store = self.repo.writable_session("main").store lsafter = sorted(self._sync_iter(self.store.list_prefix(""))) get_after = self._sync(self.store.get(path, prototype=PROTOTYPE)) @@ -185,6 +184,20 @@ def delete_chunk(self, data) -> None: self._sync(self.model.delete(path)) self._sync(self.store.delete(path)) + @precondition(lambda self: bool(self.all_arrays)) + @rule(data=st.data()) + def overwrite_array_basic_indexing(self, data) -> None: + array = data.draw(st.sampled_from(sorted(self.all_arrays))) + model_array = zarr.open_array(path=array, store=self.model) + store_array = zarr.open_array(path=array, store=self.store) + slicer = data.draw(basic_indices(shape=model_array.shape)) + note(f"overwriting array basic {slicer=}") + new_data = data.draw( + npst.arrays(shape=model_array[slicer].shape, dtype=model_array.dtype) + ) + model_array[slicer] = new_data + store_array[slicer] = new_data + @precondition(lambda self: bool(self.all_arrays)) @rule(data=st.data()) def resize_array(self, data) -> None: From eaee66a9fdd6b479f17a44e5f4dcc3111ffb0e3f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Jun 2025 16:21:51 -0600 Subject: [PATCH 23/43] Fix bug & update tests. Closes #604 --- icechunk/src/change_set.rs | 4 ++ icechunk/src/repository.rs | 83 +++++++++++++++++++++++++++++++++++--- icechunk/src/session.rs | 9 ++--- 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index fb4cff0d9..a8061e7dc 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -68,6 +68,10 @@ impl ChangeSet { }) } + pub fn is_updated_array(&self, node: &NodeId) -> bool { + self.updated_arrays.contains_key(node) + } + pub fn has_chunk_changes(&self, node: &NodeId) -> bool { self.set_chunks.get(node).map(|m| !m.is_empty()).unwrap_or(false) } diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index d778dc9cf..8dc2262f6 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -935,6 +935,7 @@ pub async fn raise_if_invalid_snapshot_id( mod tests { use std::{collections::HashMap, error::Error, path::PathBuf, sync::Arc}; + use icechunk_macros::tokio_test; use storage::logging::LoggingStorage; use tempfile::TempDir; @@ -1187,6 +1188,80 @@ mod tests { Ok(repository) } + #[tokio_test] + async fn test_resize_rewrites_manifests() -> Result<(), Box> { + let storage: Arc = new_in_memory_storage().await?; + let repo = Repository::create( + Some(RepositoryConfig { + inline_chunk_threshold_bytes: Some(0), + ..Default::default() + }), + Arc::clone(&storage), + HashMap::new(), + ) + .await?; + let mut session = repo.writable_session("main").await?; + session.add_group(Path::root(), Bytes::copy_from_slice(b"")).await?; + + let array_path: Path = "/array".to_string().try_into().unwrap(); + let shape = ArrayShape::new(vec![(4, 1)]).unwrap(); + let dimension_names = Some(vec!["t".into()]); + let array_def = Bytes::from_static(br#"{"this":"other array"}"#); + + session + .add_array( + array_path.clone(), + shape.clone(), + dimension_names.clone(), + array_def.clone(), + ) + .await?; + + let bytes = Bytes::copy_from_slice(&42i8.to_be_bytes()); + for idx in 0..4 { + let payload = session.get_chunk_writer()(bytes.clone()).await?; + session + .set_chunk_ref(array_path.clone(), ChunkIndices(vec![idx]), Some(payload)) + .await?; + } + session.commit("first commit", None).await?; + assert_manifest_count(&storage, 1).await; + + // Important we are not issuing any chunk deletes here (which is what Zarr does) + // Note we are still rewriting the manifest + let mut session = repo.writable_session("main").await?; + let shape2 = ArrayShape::new(vec![(2, 1)]).unwrap(); + session + .update_array( + &array_path, + shape2.clone(), + dimension_names.clone(), + array_def.clone(), + ) + .await?; + session.commit("second commit", None).await?; + assert_manifest_count(&storage, 2).await; + + // Now we expand the size, but don't write chunks. + // No new manifests need to be written + let mut session = repo.writable_session("main").await?; + let shape3 = ArrayShape::new(vec![(6, 1)]).unwrap(); + session + .update_array( + &array_path, + shape3.clone(), + dimension_names.clone(), + array_def.clone(), + ) + .await?; + session.commit("second commit", None).await?; + assert_manifest_count(&storage, 2).await; + + // FIXME: add more complex splitting test + + Ok(()) + } + #[tokio::test] async fn tests_manifest_splitting_simple() -> Result<(), Box> { let dim_size = 25u32; @@ -1245,7 +1320,7 @@ mod tests { ) .await?; session.commit("last split", None).await?; - total_manifests += 2; // FIXME: this should be +1 once writes are optimized + total_manifests += 1; assert_manifest_count(&storage, total_manifests).await; // check that reads are optimized; we should only fetch the last split for this query @@ -1470,7 +1545,6 @@ mod tests { let ops = logging.fetch_operations(); assert!(ops.is_empty()); - let mut add = 0; for ax in 0..shape.len() { let mut session = repository.writable_session("main").await?; let axis_size = shape.get(ax).unwrap().array_length(); @@ -1486,9 +1560,8 @@ mod tests { .await? } - add += (axis_size as u32).div_ceil(expected_split_sizes[ax]) as usize - - 1 * ((ax > 0) as usize); - total_manifests += add; + total_manifests += + (axis_size as u32).div_ceil(expected_split_sizes[ax]) as usize; session.commit(format!("finished axis {0}", ax).as_ref(), None).await?; assert_manifest_count(&backend, total_manifests).await; diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index c950e2a9e..383ed4d9d 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1714,7 +1714,7 @@ impl<'a> FlushProcess<'a> { for old_ref in manifests.iter() { // Remember that the extents written to disk are the `from`:`to` ranges // of populated chunks - match extent.overlap_with(&old_ref.extents) { + match &old_ref.extents.overlap_with(extent) { Overlap::Complete => { debug_assert!(on_disk_bbox.is_some()); // Just propagate this ref again, no rewriting necessary @@ -1877,7 +1877,9 @@ async fn flush( continue; } - if flush_data.change_set.has_chunk_changes(node_id) { + if flush_data.change_set.is_updated_array(node_id) + || flush_data.change_set.has_chunk_changes(node_id) + { trace!(path=%node.path, "Node has changes, writing a new manifest"); // Array wasn't deleted and has changes in this session // get the new node to handle changes in size, e.g. appends. @@ -2597,9 +2599,6 @@ mod tests { #[tokio_test] async fn test_repository_with_splits_and_resizes() -> Result<(), Box> { let storage: Arc = new_in_memory_storage().await?; - // let storage_settings = storage.default_settings(); - // let asset_manager = - // AssetManager::new_no_cache(Arc::clone(&storage), storage_settings.clone(), 1); let split_sizes = Some(vec![( ManifestSplitCondition::PathMatches { regex: r".*".to_string() }, From 5f4e93a08980ca61800eb8cadf355c91eb8f0ceb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Jun 2025 21:49:48 -0600 Subject: [PATCH 24/43] Stricter GC test --- icechunk/tests/test_gc.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/icechunk/tests/test_gc.rs b/icechunk/tests/test_gc.rs index 7e44f3a4c..82da26a64 100644 --- a/icechunk/tests/test_gc.rs +++ b/icechunk/tests/test_gc.rs @@ -108,6 +108,7 @@ pub async fn do_test_gc( let first_snap_id = ds.commit("first", None).await?; assert_eq!(storage.list_chunks(&storage_settings).await?.count().await, 1100); + assert_eq!(storage.list_manifests(&storage_settings).await?.count().await, 110); let mut ds = repo.writable_session("main").await?; @@ -120,6 +121,7 @@ pub async fn do_test_gc( } let second_snap_id = ds.commit("second", None).await?; assert_eq!(storage.list_chunks(&storage_settings).await?.count().await, 1110); + assert_eq!(storage.list_manifests(&storage_settings).await?.count().await, 111); // verify doing gc without dangling objects doesn't change the repo let now = Utc::now(); @@ -155,6 +157,7 @@ pub async fn do_test_gc( // we still have all the chunks assert_eq!(storage.list_chunks(&storage_settings).await?.count().await, 1110); + assert_eq!(storage.list_manifests(&storage_settings).await?.count().await, 111); let summary = garbage_collect( storage.as_ref(), @@ -164,7 +167,7 @@ pub async fn do_test_gc( ) .await?; assert_eq!(summary.chunks_deleted, 10); - assert_eq!(summary.manifests_deleted, 110); + assert_eq!(summary.manifests_deleted, 1); assert_eq!(summary.snapshots_deleted, 1); assert!(summary.bytes_deleted > summary.chunks_deleted); From a15a8acd31bdd97934df6f5a232ee1b631837804 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 12 Jun 2025 10:03:48 -0600 Subject: [PATCH 25/43] Fix stateful test --- icechunk-python/tests/test_zarr/test_stateful.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py index daa6e8555..17e1c7f85 100644 --- a/icechunk-python/tests/test_zarr/test_stateful.py +++ b/icechunk-python/tests/test_zarr/test_stateful.py @@ -49,7 +49,7 @@ def __init__(self, repo: Repository) -> None: store = repo.writable_session("main").store super().__init__(store) - @precondition(lambda self: self.store._store.session.has_uncommitted_changes) + @precondition(lambda self: self.store.session.has_uncommitted_changes) @rule(data=st.data()) def commit_with_check(self, data) -> None: note("committing and checking list_prefix") @@ -59,7 +59,7 @@ def commit_with_check(self, data) -> None: get_before = self._sync(self.store.get(path, prototype=PROTOTYPE)) assert get_before - self.store._store.session.commit("foo") + self.store.session.commit("foo") self.store = self.repo.writable_session("main").store From 0fc8c2a7457ca5463bafc4693def2211366edca6 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 12 Jun 2025 14:58:34 -0600 Subject: [PATCH 26/43] Cleanup --- icechunk/src/change_set.rs | 3 +-- icechunk/src/session.rs | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index a8061e7dc..861df90fa 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -302,9 +302,8 @@ impl ChangeSet { None => Either::Left(iter::empty()), Some(h) => Either::Right( h.iter() - // FIXME: review this .filter(move |(manifest_extent, _)| { - extent.is_none() || Some(*manifest_extent) == extent.as_ref() + extent.as_ref().is_none_or(|e| e == *manifest_extent) }) .flat_map(|(_, manifest)| manifest.iter()), ), diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 383ed4d9d..d673cb6d3 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -524,7 +524,7 @@ impl Session { self.cache_splits(node_id, path, shape, dimension_names); } #[allow(clippy::expect_used)] - self.splits.get(node_id).expect("should not be possible.") + self.splits.get(node_id).expect("splits for node should always exist.") } // Helper function that accepts a NodeSnapshot instead of a path, @@ -540,8 +540,6 @@ impl Session { if shape.valid_chunk_coord(&coord) { let splits = self .get_splits(&node.id, &node.path, &shape, &dimension_names) - // FIXME: this clone is a workaround for two mutable borrows - // on self.change_set .clone(); self.change_set.set_chunk_ref(node.id, coord, data, &splits); Ok(()) @@ -1877,7 +1875,9 @@ async fn flush( continue; } - if flush_data.change_set.is_updated_array(node_id) + if + // metadata change might have shrunk the array + flush_data.change_set.is_updated_array(node_id) || flush_data.change_set.has_chunk_changes(node_id) { trace!(path=%node.path, "Node has changes, writing a new manifest"); @@ -1902,8 +1902,6 @@ async fn flush( } else { trace!(path=%node.path, "Node has no changes, keeping the previous manifest"); // Array wasn't deleted but has no changes in this session - // FIXME: deal with the case of metadata shrinking an existing array, we should clear - // extra chunks that no longer fit in the array flush_data.copy_previous_manifest(&node, old_snapshot.as_ref()); } } From 2a6d65d82ef277c9e6718a25ef7876a303cd39ae Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 13 Jun 2025 12:41:27 -0600 Subject: [PATCH 27/43] More complex test + fixes --- icechunk/src/repository.rs | 199 +++++++++++++++++++++++++++++++------ icechunk/src/session.rs | 57 ++++++----- 2 files changed, 203 insertions(+), 53 deletions(-) diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 8dc2262f6..9896425d8 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -958,18 +958,31 @@ mod tests { use super::*; + fn ravel_multi_index<'a>(index: &[u32], shape: &[u32]) -> u32 { + index + .iter() + .zip(shape.iter()) + .rev() + .fold((0, 1), |(acc, stride), (index, size)| { + (acc + *index * stride, stride * *size) + }) + .0 + } + async fn assert_manifest_count( storage: &Arc, total_manifests: usize, ) { + let expected = storage + .list_manifests(&storage.default_settings()) + .await + .unwrap() + .count() + .await; assert_eq!( - storage - .list_manifests(&storage.default_settings()) - .await - .unwrap() - .count() - .await, - total_manifests + total_manifests, expected, + "Mismatch in manifest count: expected {}, but got {}", + expected, total_manifests, ); } @@ -1425,7 +1438,7 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio_test] async fn test_manifest_splitting_complex_config() -> Result<(), Box> { let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap(); let dimension_names = Some(vec!["t".into(), "z".into(), "y".into(), "x".into()]); @@ -1490,9 +1503,10 @@ mod tests { Ok(()) } - #[tokio::test] + #[tokio_test] async fn test_manifest_splitting_complex_writes() -> Result<(), Box> { let t_split_size = 12u32; + let other_split_size = 9u32; let y_split_size = 2u32; let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap(); @@ -1518,7 +1532,7 @@ mod tests { ManifestSplitCondition::NameMatches { regex: r".*".to_string() }, vec![ManifestSplitDim { condition: ManifestSplitDimCondition::Any, - num_chunks: 9, + num_chunks: other_split_size, }], ), ]; @@ -1537,6 +1551,7 @@ mod tests { Some(logging_c), ) .await?; + let repo_clone = repository.reopen(None, None)?; let mut total_manifests = 0; assert_manifest_count(&backend, total_manifests).await; @@ -1545,17 +1560,63 @@ mod tests { let ops = logging.fetch_operations(); assert!(ops.is_empty()); + let array_shape = + shape.iter().map(|x| x.array_length() as u32).collect::>(); + + let verify_data = async |ax, session: &Session| { + for i in 0..shape.get(ax).unwrap().array_length() { + let mut index = vec![0u32, 0, 0, 0]; + index[ax] = i as u32; + let ic = index.clone(); + let val = get_chunk( + session + .get_chunk_reader( + &temp_path, + &ChunkIndices(index), + &ByteRange::ALL, + ) + .await + .unwrap(), + ) + .await + .unwrap() + .expect(&format!("getting chunk ref failed for {:?}", &ic)); + let expected_value = + ravel_multi_index(ic.as_slice(), array_shape.as_slice()); + let expected = + Bytes::copy_from_slice(format!("{0}", expected_value).as_bytes()); + assert_eq!( + val, expected, + "For chunk {:?}, received {:?}, expected {:?}", + ic, val, expected + ); + } + }; + let verify_all_data = async |repo: &Repository| { + let session = repo + .readonly_session(&VersionInfo::BranchTipRef("main".to_string())) + .await + .unwrap(); + for ax in 0..shape.len() { + verify_data(ax, &session).await; + } + }; + + //========================================================= + // This loop iterates over axis and rewrites the boundary chunks. + // Each loop iteration must rewrite chunk_shape/split_size manifests for ax in 0..shape.len() { let mut session = repository.writable_session("main").await?; let axis_size = shape.get(ax).unwrap().array_length(); for i in 0..axis_size { let mut index = vec![0u32, 0, 0, 0]; index[ax] = i as u32; + let value = ravel_multi_index(index.as_slice(), array_shape.as_slice()); session .set_chunk_ref( temp_path.clone(), ChunkIndices(index), - Some(ChunkPayload::Inline(format!("{0}", i).into())), + Some(ChunkPayload::Inline(format!("{0}", value).into())), ) .await? } @@ -1565,25 +1626,107 @@ mod tests { session.commit(format!("finished axis {0}", ax).as_ref(), None).await?; assert_manifest_count(&backend, total_manifests).await; - for i in 0..shape.get(ax).unwrap().array_length() { - let mut index = vec![0u32, 0, 0, 0]; - index[ax] = i as u32; - let val = get_chunk( - session - .get_chunk_reader( - &temp_path, - &ChunkIndices(index), - &ByteRange::ALL, - ) - .await - .unwrap(), + verify_data(ax.clone(), &session).await; + } + verify_all_data(&repository).await; + + //========================================================= + // Now change splitting config + let split_sizes = vec![( + ManifestSplitCondition::AnyArray, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::DimensionName("t".to_string()), + num_chunks: t_split_size, + }], + )]; + + let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) }; + let man_config = ManifestConfig { + preload: Some(ManifestPreloadConfig { + max_total_refs: None, + preload_if: None, + }), + splitting: Some(split_config.clone()), + }; + let config = RepositoryConfig { + manifest: Some(man_config), + ..RepositoryConfig::default() + }; + let repository = repository.reopen(Some(config), None)?; + verify_all_data(&repository).await; + let mut session = repository.writable_session("main").await?; + let index = vec![13, 0, 0, 0]; + let value = ravel_multi_index(index.as_slice(), array_shape.as_slice()); + session + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(index), + Some(ChunkPayload::Inline(format!("{0}", value).into())), + ) + .await?; + // Important: we only create one new manifest in this case for + // the first split in the `t`-axis. Since the other splits + // are not modified we preserve all the old manifests + total_manifests += 1; + session.commit(format!("finished time again").as_ref(), None).await?; + assert_manifest_count(&backend, total_manifests).await; + verify_all_data(&repository).await; + + // now modify all splits to trigger a full rewrite + let mut session = repository.writable_session("main").await?; + for idx in [0, 12, 24] { + let index = vec![idx, 0, 0, 0]; + let value = ravel_multi_index(index.as_slice(), array_shape.as_slice()); + session + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(index), + Some(ChunkPayload::Inline(format!("{0}", value).into())), ) - .await - .unwrap() - .unwrap(); - assert_eq!(val, Bytes::copy_from_slice(format!("{0}", i).as_bytes())); - } + .await?; } + total_manifests += + (shape.get(0).unwrap().array_length() as u32).div_ceil(t_split_size) as usize; + session.commit(format!("finished time again").as_ref(), None).await?; + assert_manifest_count(&backend, total_manifests).await; + verify_all_data(&repository).await; + + //========================================================= + // Now get back to original repository with original config + // Modify one `t` split. + let mut session = repo_clone.writable_session("main").await?; + session + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(vec![0, 0, 0, 0]), + Some(ChunkPayload::Inline(format!("{0}", 0).into())), + ) + .await?; + // Important: now we rewrite one split per dimension + total_manifests += 4 - 1; + session.commit(format!("finished time again").as_ref(), None).await?; + assert_manifest_count(&backend, total_manifests).await; + verify_all_data(&repo_clone).await; + verify_all_data(&repository).await; + + let mut session = repo_clone.writable_session("main").await?; + for idx in [0, 12, 24] { + let index = vec![idx, 0, 0, 0]; + let value = ravel_multi_index(index.as_slice(), array_shape.as_slice()); + session + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(index), + Some(ChunkPayload::Inline(format!("{0}", value).into())), + ) + .await?; + } + total_manifests += + (shape.get(0).unwrap().array_length() as u32).div_ceil(t_split_size) as usize; + session.commit(format!("finished time again").as_ref(), None).await?; + assert_manifest_count(&backend, total_manifests).await; + verify_all_data(&repo_clone).await; + Ok(()) } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index d673cb6d3..5ae5b96b7 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1622,16 +1622,6 @@ impl<'a> FlushProcess<'a> { } } - fn finalize_refs( - &mut self, - node_id: &NodeId, - refs: HashMap, - ) -> SessionResult<()> { - for ref_ in refs.into_values() { - self.manifest_refs.entry(node_id.clone()).or_default().push(ref_); - } - Ok(()) - } /// Write a manifest for a node that was created in this session /// It doesn't need to look at previous manifests because the node is new async fn write_manifest_for_new_node( @@ -1643,9 +1633,6 @@ impl<'a> FlushProcess<'a> { let splits = self.splits.get(node_id).expect("getting split for node unexpectedly failed"); - let mut refs = - HashMap::::with_capacity(splits.len()); - // TODO: this could be try_fold with the refs HashMap as state for extent in splits.iter() { if self.change_set.array_manifest(node_id, extent).is_some() { @@ -1658,12 +1645,14 @@ impl<'a> FlushProcess<'a> { ) .map(Ok), ); - self.write_manifest_from_iterator(chunks) - .await? - .map(|new_ref| refs.insert(extent.clone(), new_ref)); + #[allow(clippy::expect_used)] + let new_ref = self.write_manifest_from_iterator(chunks).await?.expect( + "logic bug. for a new node, we must always write the manifest", + ); + self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref); } } - self.finalize_refs(node_id, refs) + Ok(()) } /// Write a manifest for a node that was modified in this session @@ -1678,9 +1667,8 @@ impl<'a> FlushProcess<'a> { #[allow(clippy::expect_used)] let splits = self.splits.get(&node.id).expect("splits should exist for this node."); - // populate with existing refs, if they are compatible let mut refs = - HashMap::::with_capacity(splits.len()); + HashMap::>::with_capacity(splits.len()); let on_disk_extents = manifests.iter().map(|m| m.extents.clone()).collect::>(); @@ -1700,7 +1688,7 @@ impl<'a> FlushProcess<'a> { // this split was modified in this session, rewrite it completely self.write_manifest_for_updated_chunks(node, extent) .await? - .map(|new_ref| refs.insert(extent.clone(), new_ref)); + .map(|new_ref| refs.insert(extent.clone(), vec![new_ref])); } else { let on_disk_bbox = on_disk_extents .iter() @@ -1712,11 +1700,11 @@ impl<'a> FlushProcess<'a> { for old_ref in manifests.iter() { // Remember that the extents written to disk are the `from`:`to` ranges // of populated chunks - match &old_ref.extents.overlap_with(extent) { + match old_ref.extents.overlap_with(extent) { Overlap::Complete => { debug_assert!(on_disk_bbox.is_some()); // Just propagate this ref again, no rewriting necessary - refs.insert(extent.clone(), old_ref.clone()); + refs.entry(extent.clone()).or_default().push(old_ref.clone()); // OK to unwrap here since this manifest file must exist in the old snapshot #[allow(clippy::expect_used)] self.manifest_files.insert( @@ -1727,9 +1715,12 @@ impl<'a> FlushProcess<'a> { // the splits have changed, but no refs in this split have been written in this session // same as `if` block above debug_assert!(on_disk_bbox.is_some()); - self.write_manifest_for_updated_chunks(node, extent) + if let Some(new_ref) = self + .write_manifest_for_updated_chunks(node, extent) .await? - .map(|new_ref| refs.insert(extent.clone(), new_ref)); + { + refs.entry(extent.clone()).or_default().push(new_ref); + } } Overlap::None => { // Nothing to do @@ -1739,7 +1730,12 @@ impl<'a> FlushProcess<'a> { } } - self.finalize_refs(&node.id, refs)?; + // FIXME: Assert that bboxes in refs don't overlap + + self.manifest_refs + .entry(node.id.clone()) + .or_default() + .extend(refs.into_values().flatten()); Ok(()) } @@ -1913,6 +1909,17 @@ async fn flush( flush_data.write_manifest_for_new_node(node_id, node_path).await?; } + // manifest_files & manifest_refs _must_ be consistent + debug_assert_eq!( + flush_data.manifest_files.iter().map(|x| x.id.clone()).collect::>(), + flush_data + .manifest_refs + .values() + .flatten() + .map(|x| x.object_id.clone()) + .collect::>(), + ); + trace!("Building new snapshot"); // gather and sort nodes: // this is a requirement for Snapshot::from_iter From 27ad4bc4e8008b6243c9ed603cfbf0b8d93a0c81 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 13 Jun 2025 15:09:30 -0600 Subject: [PATCH 28/43] more test --- icechunk/src/repository.rs | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 9896425d8..4b43455fd 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1703,7 +1703,7 @@ mod tests { ) .await?; // Important: now we rewrite one split per dimension - total_manifests += 4 - 1; + total_manifests += 3; session.commit(format!("finished time again").as_ref(), None).await?; assert_manifest_count(&backend, total_manifests).await; verify_all_data(&repo_clone).await; @@ -1727,6 +1727,39 @@ mod tests { assert_manifest_count(&backend, total_manifests).await; verify_all_data(&repo_clone).await; + // do that again, but with different values and test those specifically + let mut session = repo_clone.writable_session("main").await?; + for idx in [0, 12, 24] { + let index = vec![idx, 0, 0, 0]; + session + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(index), + Some(ChunkPayload::Inline(format!("{0}", idx + 2).into())), + ) + .await?; + } + total_manifests += + (shape.get(0).unwrap().array_length() as u32).div_ceil(t_split_size) as usize; + session.commit(format!("finished time again").as_ref(), None).await?; + assert_manifest_count(&backend, total_manifests).await; + for idx in [0, 12, 24] { + let actual = get_chunk( + session + .get_chunk_reader( + &temp_path, + &ChunkIndices(vec![idx.clone(), 0, 0, 0]), + &ByteRange::ALL, + ) + .await + .unwrap(), + ) + .await + .unwrap() + .unwrap(); + let expected = Bytes::copy_from_slice(format!("{0}", idx + 2).as_bytes()); + assert_eq!(actual,expected); + } Ok(()) } From 14d70df4f358f5f681548935fd7e4369ce0b47bb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 13 Jun 2025 16:39:20 -0600 Subject: [PATCH 29/43] Fix merging --- icechunk/src/change_set.rs | 8 +- icechunk/src/format/manifest.rs | 12 +++ icechunk/src/repository.rs | 155 +++++++++++++++++++++++++++++--- icechunk/src/session.rs | 11 +++ 4 files changed, 168 insertions(+), 18 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 861df90fa..085f4f542 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -387,16 +387,16 @@ impl ChangeSet { // FIXME: do we even test this? self.deleted_chunks_outside_bounds.extend(other.deleted_chunks_outside_bounds); - for (node, other_splits) in other.set_chunks { + other.set_chunks.into_iter().for_each(|(node, other_splits)| { let manifests = self.set_chunks.entry(node).or_insert_with(|| { HashMap::::with_capacity( other_splits.len(), ) }); - for (extent, their_manifest) in other_splits { + other_splits.into_iter().for_each(|(extent, their_manifest)| { manifests.entry(extent).or_default().extend(their_manifest) - } - } + }) + }); } pub fn merge_many>(&mut self, others: T) { diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index 9e0ae5b55..446a5b6cc 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -164,6 +164,18 @@ impl ManifestSplits { pub fn len(&self) -> usize { self.0.len() } + + pub fn compatible_with(&self, other: &Self) -> bool { + for ours in self.iter() { + if any(other.iter(), |theirs| { + ours.overlap_with(theirs) == Overlap::Partial + || theirs.overlap_with(ours) == Overlap::Partial + }) { + return false; + } + } + true + } } /// Helper function for constructing uniformly spaced manifest split edges diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 4b43455fd..c91470469 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -936,6 +936,7 @@ mod tests { use std::{collections::HashMap, error::Error, path::PathBuf, sync::Arc}; use icechunk_macros::tokio_test; + use itertools::enumerate; use storage::logging::LoggingStorage; use tempfile::TempDir; @@ -1164,6 +1165,25 @@ mod tests { )); } + fn reopen_repo_with_new_splitting_config( + repo: &Repository, + split_sizes: Option)>>, + ) -> Repository { + let split_config = ManifestSplittingConfig { split_sizes }; + let man_config = ManifestConfig { + preload: Some(ManifestPreloadConfig { + max_total_refs: None, + preload_if: None, + }), + splitting: Some(split_config.clone()), + }; + let config = RepositoryConfig { + manifest: Some(man_config), + ..RepositoryConfig::default() + }; + repo.reopen(Some(config), None).unwrap() + } + async fn create_repo_with_split_manifest_config( path: &Path, shape: &ArrayShape, @@ -1640,19 +1660,8 @@ mod tests { }], )]; - let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) }; - let man_config = ManifestConfig { - preload: Some(ManifestPreloadConfig { - max_total_refs: None, - preload_if: None, - }), - splitting: Some(split_config.clone()), - }; - let config = RepositoryConfig { - manifest: Some(man_config), - ..RepositoryConfig::default() - }; - let repository = repository.reopen(Some(config), None)?; + let repository = + reopen_repo_with_new_splitting_config(&repository, Some(split_sizes)); verify_all_data(&repository).await; let mut session = repository.writable_session("main").await?; let index = vec![13, 0, 0, 0]; @@ -1758,7 +1767,125 @@ mod tests { .unwrap() .unwrap(); let expected = Bytes::copy_from_slice(format!("{0}", idx + 2).as_bytes()); - assert_eq!(actual,expected); + assert_eq!(actual, expected); + } + Ok(()) + } + + #[tokio_test] + async fn test_manifest_splits_merge_sessions() -> Result<(), Box> { + let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap(); + let dimension_names = Some(vec!["t".into(), "z".into(), "y".into(), "x".into()]); + let temp_path: Path = "/temperature".try_into().unwrap(); + + let orig_split_sizes = vec![( + ManifestSplitCondition::AnyArray, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::DimensionName("t".to_string()), + num_chunks: 12u32, + }], + )]; + let split_config = + ManifestSplittingConfig { split_sizes: Some(orig_split_sizes.clone()) }; + let backend: Arc = new_in_memory_storage().await?; + let repository = create_repo_with_split_manifest_config( + &temp_path, + &shape, + &dimension_names, + &split_config, + Some(backend), + ) + .await?; + + let indices = + vec![vec![0, 0, 1, 0], vec![0, 0, 0, 0], vec![0, 2, 0, 0], vec![0, 2, 0, 1]]; + + let mut session1 = repository.writable_session("main").await?; + session1 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[0].clone()), + Some(ChunkPayload::Inline(format!("{0}", 0).into())), + ) + .await?; + session1 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[1].clone()), + Some(ChunkPayload::Inline(format!("{0}", 1).into())), + ) + .await?; + + for incompatible_size in [1, 11u32, 24u32, u32::MAX] { + let incompatible_split_sizes = vec![( + ManifestSplitCondition::AnyArray, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::DimensionName("t".to_string()), + num_chunks: incompatible_size, + }], + )]; + let other_repo = reopen_repo_with_new_splitting_config( + &repository, + Some(incompatible_split_sizes), + ); + + assert_ne!(other_repo.config(), repository.config()); + + let mut session2 = other_repo.writable_session("main").await?; + session2 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[2].clone()), + Some(ChunkPayload::Inline(format!("{0}", 2).into())), + ) + .await?; + session2 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[3].clone()), + Some(ChunkPayload::Inline(format!("{0}", 3).into())), + ) + .await?; + + assert!(session1.merge(session2).await.is_err()); + } + + // now with the same split sizes + let other_repo = + reopen_repo_with_new_splitting_config(&repository, Some(orig_split_sizes)); + let mut session2 = other_repo.writable_session("main").await?; + session2 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[2].clone()), + Some(ChunkPayload::Inline(format!("{0}", 2).into())), + ) + .await?; + session2 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[3].clone()), + Some(ChunkPayload::Inline(format!("{0}", 3).into())), + ) + .await?; + + session1.merge(session2).await?; + for (val, idx) in enumerate(indices.iter()) { + let actual = get_chunk( + session1 + .get_chunk_reader( + &temp_path, + &ChunkIndices(idx.clone()), + &ByteRange::ALL, + ) + .await + .unwrap(), + ) + .await + .unwrap() + .expect(&format!("getting chunk ref failed for {:?}", &idx)); + let expected = Bytes::copy_from_slice(format!("{0}", val).as_bytes()); + assert_eq!(actual, expected); } Ok(()) } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 5ae5b96b7..e56210fb8 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -109,6 +109,8 @@ pub enum SessionErrorKind { InvalidIndex { coords: ChunkIndices, path: Path }, #[error("invalid chunk index for splitting manifests: {coords:?}")] InvalidIndexForSplitManifests { coords: ChunkIndices }, + #[error("incompatible manifest splitting config when merging sessions")] + IncompatibleSplits, #[error("`to` snapshot ancestry doesn't include `from`")] BadSnapshotChainForDiff, } @@ -904,6 +906,15 @@ impl Session { return Err(SessionErrorKind::ReadOnlySession.into()); } let Session { splits, change_set, .. } = other; + + if self.splits.iter().any(|(node, our_splits)| { + splits + .get(node) + .is_some_and(|their_splits| !our_splits.compatible_with(their_splits)) + }) { + return Err(SessionErrorKind::IncompatibleSplits.into()); + } + self.splits.extend(splits); self.change_set.merge(change_set); Ok(()) From 27c7c4b9c21ff1210f2df487cadb3e338ad2d6ea Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 13 Jun 2025 17:01:31 -0600 Subject: [PATCH 30/43] reorg tests --- icechunk/src/format/manifest.rs | 253 ++++++++++++++++++++++++++++---- icechunk/src/session.rs | 186 +---------------------- 2 files changed, 223 insertions(+), 216 deletions(-) diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index 446a5b6cc..eea131b77 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -594,43 +594,232 @@ static ROOT_OPTIONS: VerifierOptions = VerifierOptions { #[allow(clippy::unwrap_used, clippy::panic)] mod tests { use super::*; - use crate::strategies::{ShapeDim, shapes_and_dims}; + use crate::strategies::{ShapeDim, manifest_extents, shapes_and_dims}; use icechunk_macros; + use itertools::{all, multizip}; + use proptest::collection::vec; use proptest::prelude::*; + use std::error::Error; + use test_strategy::proptest; + + #[proptest] + fn test_property_extents_set_ops_same( + #[strategy(manifest_extents(4))] e: ManifestExtents, + ) { + prop_assert_eq!(e.intersection(&e), Some(e.clone())); + prop_assert_eq!(e.union(&e), e.clone()); + prop_assert_eq!(e.overlap_with(&e), Overlap::Complete); + } + + #[proptest] + fn test_property_extents_set_ops( + #[strategy(manifest_extents(4))] e1: ManifestExtents, + #[strategy(manifest_extents(4))] e2: ManifestExtents, + ) { + let union = e1.union(&e2); + let intersection = e1.intersection(&e2); + + prop_assert_eq!(e1.intersection(&union), Some(e1.clone())); + prop_assert_eq!(union.intersection(&e1), Some(e1.clone())); + prop_assert_eq!(e2.intersection(&union), Some(e2.clone())); + prop_assert_eq!(union.intersection(&e2), Some(e2.clone())); + + // order is important for the next 2 + prop_assert_eq!(e1.overlap_with(&union), Overlap::Complete); + prop_assert_eq!(e2.overlap_with(&union), Overlap::Complete); + + if intersection.is_some() { + let int = intersection.unwrap(); + let expected = if e1 == e1 { Overlap::Complete } else { Overlap::Partial }; + prop_assert_eq!(int.overlap_with(&e1), expected.clone()); + prop_assert_eq!(int.overlap_with(&e2), expected); + } else { + prop_assert_eq!(e2.overlap_with(&e1), Overlap::None); + prop_assert_eq!(e1.overlap_with(&e2), Overlap::None); + } + } - proptest! { - #[icechunk_macros::test] - fn test_manifest_split_from_edges(shape_dim in shapes_and_dims(Some(5))) { - // Note: using the shape, chunks strategy to generate chunk_shape, split_shape - let ShapeDim { shape, .. } = shape_dim; - - let num_chunks = shape.iter().map(|x| x.array_length()).collect::>(); - let split_shape = shape.iter().map(|x| x.chunk_length()).collect::>(); - - let ndim = shape.len(); - let edges: Vec> = - (0usize..ndim).map(|axis| { - uniform_manifest_split_edges(num_chunks[axis] as u32, &(split_shape[axis] as u32)) - } - ).collect(); - - let splits = ManifestSplits::from_edges(edges.into_iter()); - for edge in splits.iter() { - // must be ndim ranges - prop_assert_eq!(edge.len(), ndim); - for range in edge.iter() { - prop_assert!(range.end > range.start); - } - } + #[proptest] + fn test_property_extents_widths( + #[strategy(manifest_extents(4))] extent1: ManifestExtents, + #[strategy(vec(0..100, 4))] delta_left: Vec, + #[strategy(vec(0..100, 4))] delta_right: Vec, + ) { + let widths = extent1.iter().map(|r| (r.end - r.start) as i32).collect::>(); + let extent2 = ManifestExtents::from_ranges_iter( + multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map( + |(extent, dleft, dright)| { + ((extent.start as i32 + dleft) as u32) + ..((extent.end as i32 + dright) as u32) + }, + ), + ); + + if all(delta_left.iter(), |elem| elem == &0i32) + && all(delta_right.iter(), |elem| elem == &0i32) + { + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Complete); + } + + let extent2 = ManifestExtents::from_ranges_iter( + multizip(( + extent1.iter(), + widths.iter(), + delta_left.iter(), + delta_right.iter(), + )) + .map(|(extent, width, dleft, dright)| { + let (low, high) = (dleft.min(dright), dleft.max(dright)); + ((extent.start as i32 + width + low) as u32) + ..((extent.end as i32 + width + high) as u32) + }), + ); + + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None); + + let extent2 = ManifestExtents::from_ranges_iter( + multizip(( + extent1.iter(), + widths.iter(), + delta_left.iter(), + delta_right.iter(), + )) + .map(|(extent, width, dleft, dright)| { + let (low, high) = (dleft.min(dright), dleft.max(dright)); + ((extent.start as i32 - width - high).max(0i32) as u32) + ..((extent.end as i32 - width - low) as u32) + }), + ); + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None); + + let extent2 = ManifestExtents::from_ranges_iter( + multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map( + |(extent, dleft, dright)| { + ((extent.start as i32 - dleft - 1).max(0i32) as u32) + ..((extent.end as i32 + dright + 1) as u32) + }, + ), + ); + prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Partial); + } + + #[icechunk_macros::test] + fn test_overlaps() -> Result<(), Box> { + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![2u32, 4, 6].as_slice(), + ); + + let e2 = ManifestExtents::new( + vec![10u32, 1, 2].as_slice(), + vec![12u32, 4, 6].as_slice(), + ); + + let union = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![12u32, 4, 6].as_slice(), + ); + + assert_eq!(e2.overlap_with(&e1), Overlap::None); + assert_eq!(e1.intersection(&e2), None); + assert_eq!(e1.union(&e2), union); - // when using from_edges, extents must not exactly overlap - for edges in splits.iter().combinations(2) { - let is_equal = std::iter::zip(edges[0].iter(), edges[1].iter()) - .all(|(range1, range2)| { - (range1.start == range2.start) && (range1.end == range2.end) - }); - prop_assert!(!is_equal); + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![2u32, 4, 6].as_slice(), + ); + let e2 = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![42u32, 4, 6].as_slice(), + ); + assert_eq!(e2.overlap_with(&e1), Overlap::None); + assert_eq!(e1.overlap_with(&e2), Overlap::None); + + // asymmetric case + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let e2 = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let union = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let intersection = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + assert_eq!(e2.overlap_with(&e1), Overlap::Complete); + assert_eq!(e1.overlap_with(&e2), Overlap::Partial); + assert_eq!(e1.union(&e2), union.clone()); + assert_eq!(e2.union(&e1), union.clone()); + assert_eq!(e1.intersection(&e2), Some(intersection)); + + // empty set + let e1 = ManifestExtents::new( + vec![0u32, 1, 2].as_slice(), + vec![3u32, 4, 6].as_slice(), + ); + let e2 = ManifestExtents::new( + vec![2u32, 1, 2].as_slice(), + vec![2u32, 4, 6].as_slice(), + ); + assert_eq!(e1.intersection(&e2), None); + + // this should create non-overlapping extents + let splits = ManifestSplits::from_edges(vec![ + vec![0, 10, 20], + vec![0, 1, 2], + vec![0, 21, 22], + ]); + for vec in splits.iter().combinations(2) { + assert_eq!(vec[0].overlap_with(vec[1]), Overlap::None); + assert_eq!(vec[1].overlap_with(vec[0]), Overlap::None); + } + + Ok(()) + } + + #[proptest] + fn test_manifest_split_from_edges( + #[strategy(shapes_and_dims(Some(5)))] shape_dim: ShapeDim, + ) { + // Note: using the shape, chunks strategy to generate chunk_shape, split_shape + let ShapeDim { shape, .. } = shape_dim; + + let num_chunks = shape.iter().map(|x| x.array_length()).collect::>(); + let split_shape = shape.iter().map(|x| x.chunk_length()).collect::>(); + + let ndim = shape.len(); + let edges: Vec> = (0usize..ndim) + .map(|axis| { + uniform_manifest_split_edges( + num_chunks[axis] as u32, + &(split_shape[axis] as u32), + ) + }) + .collect(); + + let splits = ManifestSplits::from_edges(edges.into_iter()); + for edge in splits.iter() { + // must be ndim ranges + prop_assert_eq!(edge.len(), ndim); + for range in edge.iter() { + prop_assert!(range.end > range.start); } } + + // when using from_edges, extents must not exactly overlap + for edges in splits.iter().combinations(2) { + let is_equal = std::iter::zip(edges[0].iter(), edges[1].iter()).all( + |(range1, range2)| { + (range1.start == range2.start) && (range1.end == range2.end) + }, + ); + prop_assert!(!is_equal); + } } } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index e56210fb8..0f11af6c3 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -2167,16 +2167,14 @@ mod tests { repository::VersionInfo, storage::new_in_memory_storage, strategies::{ - ShapeDim, chunk_indices, empty_writable_session, manifest_extents, - node_paths, shapes_and_dims, + ShapeDim, chunk_indices, empty_writable_session, node_paths, shapes_and_dims, }, }; use super::*; use icechunk_macros::tokio_test; - use itertools::{Itertools, all, multizip}; + use itertools::Itertools; use pretty_assertions::assert_eq; - use proptest::collection::vec; use proptest::prelude::{prop_assert, prop_assert_eq}; use storage::logging::LoggingStorage; use test_strategy::proptest; @@ -2336,186 +2334,6 @@ mod tests { prop_assert!(session.delete_group(path.clone()).await.is_ok()); } - #[proptest] - fn test_property_extents_set_ops_same( - #[strategy(manifest_extents(4))] e: ManifestExtents, - ) { - prop_assert_eq!(e.intersection(&e), Some(e.clone())); - prop_assert_eq!(e.union(&e), e.clone()); - prop_assert_eq!(e.overlap_with(&e), Overlap::Complete); - } - - #[proptest] - fn test_property_extents_set_ops( - #[strategy(manifest_extents(4))] e1: ManifestExtents, - #[strategy(manifest_extents(4))] e2: ManifestExtents, - ) { - let union = e1.union(&e2); - let intersection = e1.intersection(&e2); - - prop_assert_eq!(e1.intersection(&union), Some(e1.clone())); - prop_assert_eq!(union.intersection(&e1), Some(e1.clone())); - prop_assert_eq!(e2.intersection(&union), Some(e2.clone())); - prop_assert_eq!(union.intersection(&e2), Some(e2.clone())); - - // order is important for the next 2 - prop_assert_eq!(e1.overlap_with(&union), Overlap::Complete); - prop_assert_eq!(e2.overlap_with(&union), Overlap::Complete); - - if intersection.is_some() { - let int = intersection.unwrap(); - let expected = if e1 == e1 { Overlap::Complete } else { Overlap::Partial }; - prop_assert_eq!(int.overlap_with(&e1), expected.clone()); - prop_assert_eq!(int.overlap_with(&e2), expected); - } else { - prop_assert_eq!(e2.overlap_with(&e1), Overlap::None); - prop_assert_eq!(e1.overlap_with(&e2), Overlap::None); - } - } - - #[proptest] - fn test_property_extents_widths( - #[strategy(manifest_extents(4))] extent1: ManifestExtents, - #[strategy(vec(0..100, 4))] delta_left: Vec, - #[strategy(vec(0..100, 4))] delta_right: Vec, - ) { - let widths = extent1.iter().map(|r| (r.end - r.start) as i32).collect::>(); - let extent2 = ManifestExtents::from_ranges_iter( - multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map( - |(extent, dleft, dright)| { - ((extent.start as i32 + dleft) as u32) - ..((extent.end as i32 + dright) as u32) - }, - ), - ); - - if all(delta_left.iter(), |elem| elem == &0i32) - && all(delta_right.iter(), |elem| elem == &0i32) - { - prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Complete); - } - - let extent2 = ManifestExtents::from_ranges_iter( - multizip(( - extent1.iter(), - widths.iter(), - delta_left.iter(), - delta_right.iter(), - )) - .map(|(extent, width, dleft, dright)| { - let (low, high) = (dleft.min(dright), dleft.max(dright)); - ((extent.start as i32 + width + low) as u32) - ..((extent.end as i32 + width + high) as u32) - }), - ); - - prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None); - - let extent2 = ManifestExtents::from_ranges_iter( - multizip(( - extent1.iter(), - widths.iter(), - delta_left.iter(), - delta_right.iter(), - )) - .map(|(extent, width, dleft, dright)| { - let (low, high) = (dleft.min(dright), dleft.max(dright)); - ((extent.start as i32 - width - high).max(0i32) as u32) - ..((extent.end as i32 - width - low) as u32) - }), - ); - prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::None); - - let extent2 = ManifestExtents::from_ranges_iter( - multizip((extent1.iter(), delta_left.iter(), delta_right.iter())).map( - |(extent, dleft, dright)| { - ((extent.start as i32 - dleft - 1).max(0i32) as u32) - ..((extent.end as i32 + dright + 1) as u32) - }, - ), - ); - prop_assert_eq!(extent2.overlap_with(&extent1), Overlap::Partial); - } - - #[icechunk_macros::test] - fn test_overlaps() -> Result<(), Box> { - let e1 = ManifestExtents::new( - vec![0u32, 1, 2].as_slice(), - vec![2u32, 4, 6].as_slice(), - ); - - let e2 = ManifestExtents::new( - vec![10u32, 1, 2].as_slice(), - vec![12u32, 4, 6].as_slice(), - ); - - let union = ManifestExtents::new( - vec![0u32, 1, 2].as_slice(), - vec![12u32, 4, 6].as_slice(), - ); - - assert_eq!(e2.overlap_with(&e1), Overlap::None); - assert_eq!(e1.intersection(&e2), None); - assert_eq!(e1.union(&e2), union); - - let e1 = ManifestExtents::new( - vec![0u32, 1, 2].as_slice(), - vec![2u32, 4, 6].as_slice(), - ); - let e2 = ManifestExtents::new( - vec![2u32, 1, 2].as_slice(), - vec![42u32, 4, 6].as_slice(), - ); - assert_eq!(e2.overlap_with(&e1), Overlap::None); - assert_eq!(e1.overlap_with(&e2), Overlap::None); - - // asymmetric case - let e1 = ManifestExtents::new( - vec![0u32, 1, 2].as_slice(), - vec![3u32, 4, 6].as_slice(), - ); - let e2 = ManifestExtents::new( - vec![2u32, 1, 2].as_slice(), - vec![3u32, 4, 6].as_slice(), - ); - let union = ManifestExtents::new( - vec![0u32, 1, 2].as_slice(), - vec![3u32, 4, 6].as_slice(), - ); - let intersection = ManifestExtents::new( - vec![2u32, 1, 2].as_slice(), - vec![3u32, 4, 6].as_slice(), - ); - assert_eq!(e2.overlap_with(&e1), Overlap::Complete); - assert_eq!(e1.overlap_with(&e2), Overlap::Partial); - assert_eq!(e1.union(&e2), union.clone()); - assert_eq!(e2.union(&e1), union.clone()); - assert_eq!(e1.intersection(&e2), Some(intersection)); - - // empty set - let e1 = ManifestExtents::new( - vec![0u32, 1, 2].as_slice(), - vec![3u32, 4, 6].as_slice(), - ); - let e2 = ManifestExtents::new( - vec![2u32, 1, 2].as_slice(), - vec![2u32, 4, 6].as_slice(), - ); - assert_eq!(e1.intersection(&e2), None); - - // this should create non-overlapping extents - let splits = ManifestSplits::from_edges(vec![ - vec![0, 10, 20], - vec![0, 1, 2], - vec![0, 21, 22], - ]); - for vec in splits.iter().combinations(2) { - assert_eq!(vec[0].overlap_with(vec[1]), Overlap::None); - assert_eq!(vec[1].overlap_with(vec[0]), Overlap::None); - } - - Ok(()) - } #[proptest(async = "tokio")] async fn test_aggregate_extents( #[strategy(proptest::collection::vec(chunk_indices(3, 0..1_000_000), 1..50))] From 9e1c8adc2da1420f22739bff9a36613f1f2ff903 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 13 Jun 2025 17:22:32 -0600 Subject: [PATCH 31/43] edits --- icechunk/src/change_set.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 085f4f542..04a0ddb6e 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -130,6 +130,7 @@ impl ChangeSet { } // update existing splits + let mut to_remove = HashSet::::new(); if let Some(manifests) = self.set_chunks.remove(node_id) { let mut new_deleted_chunks = HashSet::::new(); let mut new_manifests = @@ -165,15 +166,18 @@ impl ChangeSet { } // bring back any previously tracked deletes - if let Some(deletes) = self.deleted_chunks_outside_bounds.get(node_id) { + if let Some(deletes) = self.deleted_chunks_outside_bounds.get_mut(node_id) { for coord in deletes.iter() { if let Some(extents) = new_splits.find(coord) { new_manifests .entry(extents) .or_default() .insert(coord.clone(), None); + to_remove.insert(coord.clone()); }; } + deletes.retain(|item| !to_remove.contains(item)); + to_remove.drain(); }; self.set_chunks.insert(node_id.clone(), new_manifests); @@ -242,16 +246,16 @@ impl ChangeSet { // this implementation makes delete idempotent // it allows deleting a deleted chunk by repeatedly setting None. self.set_chunks - .entry(node_id.clone()) + .entry(node_id) .or_insert_with(|| { HashMap::< ManifestExtents, BTreeMap>, >::with_capacity(splits.len()) }) - .entry(extent.clone()) + .entry(extent) .or_default() - .insert(coord.clone(), data.clone()); + .insert(coord, data); } pub fn get_chunk_ref( From e974f483da600fb88a54838f813e9e0d16aae810 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Jun 2025 13:22:30 -0600 Subject: [PATCH 32/43] Add conflicting commits test --- icechunk/src/change_set.rs | 1 - icechunk/src/repository.rs | 162 ++++++++++++++++++++++++++++++++++++- 2 files changed, 160 insertions(+), 3 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 04a0ddb6e..1e0a13ca8 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -77,7 +77,6 @@ impl ChangeSet { } pub fn arrays_with_chunk_changes(&self) -> impl Iterator { - // FIXME: needs test for session with only chunk deletes on existing nodes self.set_chunks.keys() } diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index c91470469..0c1d7f7ef 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -933,7 +933,7 @@ pub async fn raise_if_invalid_snapshot_id( #[cfg(test)] #[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)] mod tests { - use std::{collections::HashMap, error::Error, path::PathBuf, sync::Arc}; + use std::{collections::HashMap, error::Error, iter::zip, path::PathBuf, sync::Arc}; use icechunk_macros::tokio_test; use itertools::enumerate; @@ -947,13 +947,14 @@ mod tests { ManifestSplitDim, ManifestSplitDimCondition, ManifestSplittingConfig, RepositoryConfig, }, + conflicts::basic_solver::BasicConflictSolver, format::{ ByteRange, ChunkIndices, manifest::{ChunkPayload, ManifestSplits}, snapshot::{ArrayShape, DimensionName}, }, new_local_filesystem_storage, - session::get_chunk, + session::{SessionError, get_chunk}, storage::new_in_memory_storage, }; @@ -1887,6 +1888,163 @@ mod tests { let expected = Bytes::copy_from_slice(format!("{0}", val).as_bytes()); assert_eq!(actual, expected); } + + // now merge two sessions: one with only writes, one with only deletes + let mut session1 = repository.writable_session("main").await?; + session1 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[0].clone()), + Some(ChunkPayload::Inline(format!("{0}", 3).into())), + ) + .await?; + session1 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[1].clone()), + Some(ChunkPayload::Inline(format!("{0}", 4).into())), + ) + .await?; + let mut session2 = repository.writable_session("main").await?; + session2 + .set_chunk_ref(temp_path.clone(), ChunkIndices(indices[2].clone()), None) + .await?; + session2 + .set_chunk_ref(temp_path.clone(), ChunkIndices(indices[3].clone()), None) + .await?; + + session1.merge(session2).await?; + let expected = vec![Some(3), Some(4), None, None]; + for (expect, idx) in zip(expected.iter(), indices.iter()) { + let actual = get_chunk( + session1 + .get_chunk_reader( + &temp_path, + &ChunkIndices(idx.clone()), + &ByteRange::ALL, + ) + .await + .unwrap(), + ) + .await + .unwrap(); + let expected_value = + expect.map(|val| Bytes::copy_from_slice(format!("{0}", val).as_bytes())); + assert_eq!(actual, expected_value); + } + + Ok(()) + } + + #[tokio_test] + async fn test_commits_with_conflicting_manifest_splits() -> Result<(), Box> + { + let shape = ArrayShape::new(vec![(25, 1), (10, 1), (3, 1), (4, 1)]).unwrap(); + let dimension_names = Some(vec!["t".into(), "z".into(), "y".into(), "x".into()]); + let temp_path: Path = "/temperature".try_into().unwrap(); + + let orig_split_sizes = vec![( + ManifestSplitCondition::AnyArray, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::DimensionName("t".to_string()), + num_chunks: 12u32, + }], + )]; + let split_config = + ManifestSplittingConfig { split_sizes: Some(orig_split_sizes.clone()) }; + let backend: Arc = new_in_memory_storage().await?; + let repository = create_repo_with_split_manifest_config( + &temp_path, + &shape, + &dimension_names, + &split_config, + Some(backend), + ) + .await?; + + let indices = + vec![vec![0, 0, 1, 0], vec![0, 0, 0, 0], vec![0, 2, 0, 0], vec![0, 2, 0, 1]]; + + let mut session1 = repository.writable_session("main").await?; + session1 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[0].clone()), + Some(ChunkPayload::Inline(format!("{0}", 0).into())), + ) + .await?; + session1 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[1].clone()), + Some(ChunkPayload::Inline(format!("{0}", 1).into())), + ) + .await?; + + let incompatible_size = 11u32; + let incompatible_split_sizes = vec![( + ManifestSplitCondition::AnyArray, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::DimensionName("t".to_string()), + num_chunks: incompatible_size, + }], + )]; + let other_repo = reopen_repo_with_new_splitting_config( + &repository, + Some(incompatible_split_sizes), + ); + + assert_ne!(other_repo.config(), repository.config()); + + let mut session2 = other_repo.writable_session("main").await?; + session2 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[2].clone()), + Some(ChunkPayload::Inline(format!("{0}", 2).into())), + ) + .await?; + session2 + .set_chunk_ref( + temp_path.clone(), + ChunkIndices(indices[3].clone()), + Some(ChunkPayload::Inline(format!("{0}", 3).into())), + ) + .await?; + + session1.commit("first commit", None).await?; + if let Err(SessionError { kind: SessionErrorKind::Conflict { .. }, .. }) = + session2.commit("second commit", None).await + { + let solver = BasicConflictSolver::default(); + // different chunks were written so this should fast forward + assert!(session2.rebase(&solver).await.is_ok()); + session2.commit("second commit after rebase", None).await?; + } else { + panic!("this should have conflicted!"); + } + + let new_session = repository + .readonly_session(&VersionInfo::BranchTipRef("main".into())) + .await?; + for (val, idx) in enumerate(indices.iter()) { + let actual = get_chunk( + new_session + .get_chunk_reader( + &temp_path, + &ChunkIndices(idx.clone()), + &ByteRange::ALL, + ) + .await + .unwrap(), + ) + .await + .unwrap() + .expect(&format!("getting chunk ref failed for {:?}", &idx)); + let expected = Bytes::copy_from_slice(format!("{0}", val).as_bytes()); + assert_eq!(actual, expected); + } + Ok(()) } From 76e58fd14d25bacaf07e18c16c7475c36d6f801e Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Jun 2025 14:10:25 -0600 Subject: [PATCH 33/43] Add test for splits changing in a session --- icechunk/src/repository.rs | 127 ++++++++++++++++++++++++++++++++++++- icechunk/src/session.rs | 7 +- 2 files changed, 130 insertions(+), 4 deletions(-) diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 0c1d7f7ef..62997aa94 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1262,7 +1262,8 @@ mod tests { assert_manifest_count(&storage, 1).await; // Important we are not issuing any chunk deletes here (which is what Zarr does) - // Note we are still rewriting the manifest + // Note we are still rewriting the manifest even without chunk changes + // GH604 let mut session = repo.writable_session("main").await?; let shape2 = ArrayShape::new(vec![(2, 1)]).unwrap(); session @@ -1291,12 +1292,132 @@ mod tests { session.commit("second commit", None).await?; assert_manifest_count(&storage, 2).await; - // FIXME: add more complex splitting test + Ok(()) + } + + #[tokio_test] + async fn test_splits_change_in_session() -> Result<(), Box> { + let shape = ArrayShape::new(vec![(13, 1), (2, 1), (1, 1)]).unwrap(); + let dimension_names = Some(vec!["t".into(), "y".into(), "x".into()]); + let new_dimension_names = Some(vec!["time".into(), "y".into(), "x".into()]); + let array_path: Path = "/temperature".try_into().unwrap(); + let array_def = Bytes::from_static(br#"{"this":"other array"}"#); + + // two possible split sizes t: 3, time: 4; + // then we rename `t` to `time` 😈 + let split_sizes = vec![ + ( + ManifestSplitCondition::PathMatches { regex: r".*".to_string() }, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::DimensionName( + "^t$".to_string(), + ), + num_chunks: 3, + }], + ), + ( + ManifestSplitCondition::PathMatches { regex: r".*".to_string() }, + vec![ManifestSplitDim { + condition: ManifestSplitDimCondition::DimensionName( + "time".to_string(), + ), + num_chunks: 4, + }], + ), + ]; + let split_config = ManifestSplittingConfig { split_sizes: Some(split_sizes) }; + + let backend: Arc = new_in_memory_storage().await?; + let logging = Arc::new(LoggingStorage::new(Arc::clone(&backend))); + let storage: Arc = logging.clone(); + let repository = create_repo_with_split_manifest_config( + &array_path, + &shape, + &dimension_names, + &split_config, + Some(Arc::clone(&storage)), + ) + .await?; + + let verify_data = async |session: &Session, offset: u32| { + for idx in 0..12 { + let actual = get_chunk( + session + .get_chunk_reader( + &array_path, + &ChunkIndices(vec![idx.clone(), 0, 0]), + &ByteRange::ALL, + ) + .await + .unwrap(), + ) + .await + .unwrap() + .unwrap(); + let expected = Bytes::copy_from_slice(format!("{0}", idx + offset).as_bytes()); + assert_eq!(actual, expected); + } + }; + + let mut session = repository.writable_session("main").await?; + for i in 0..12 { + session + .set_chunk_ref( + array_path.clone(), + ChunkIndices(vec![i, 0, 0]), + Some(ChunkPayload::Inline(format!("{0}", i).into())), + ) + .await? + } + verify_data(&session, 0).await; + + let node = session.get_node(&array_path).await?; + let orig_splits = session.lookup_splits(&node.id).cloned(); + assert_eq!( + orig_splits, + Some(ManifestSplits::from_edges(vec![ + vec![0, 3, 6, 9, 12, 13], + vec![0, 2], + vec![0, 1] + ])) + ); + + // this should update the splits + session + .update_array( + &array_path, + shape.clone(), + new_dimension_names.clone(), + array_def.clone(), + ) + .await?; + verify_data(&session, 0).await; + let new_splits = session.lookup_splits(&node.id).cloned(); + assert_eq!( + new_splits, + Some(ManifestSplits::from_edges(vec![ + vec![0, 4, 8, 12, 13], + vec![0, 2], + vec![0, 1] + ])) + ); + + // update data + for i in 0..12 { + session + .set_chunk_ref( + array_path.clone(), + ChunkIndices(vec![i, 0, 0]), + Some(ChunkPayload::Inline(format!("{0}", i+10).into())), + ) + .await? + } + verify_data(&session, 10).await; Ok(()) } - #[tokio::test] + #[tokio_test] async fn tests_manifest_splitting_simple() -> Result<(), Box> { let dim_size = 25u32; let chunk_size = 1u32; diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 0f11af6c3..57952ec91 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -500,6 +500,10 @@ impl Session { self.set_node_chunk_ref(node_snapshot, coord, data).await } + pub fn lookup_splits(&self, node_id: &NodeId) -> Option<&ManifestSplits> { + self.splits.get(node_id) + } + fn cache_splits( &mut self, node_id: &NodeId, @@ -507,9 +511,9 @@ impl Session { shape: &ArrayShape, dimension_names: &Option>, ) { - // FIXME: handle conflicts here // Q: What happens if we set a chunk, then change a dimension name, so // that the split changes. + // A: we reorg the existing chunk refs in the changeset to the new splits. let splitting = self.config.manifest().splitting(); let splits = splitting.get_split_sizes(path, shape, dimension_names); self.splits.insert(node_id.clone(), splits); @@ -1842,6 +1846,7 @@ impl ManifestSplittingConfig { } in dim_specs.iter() { if dim_condition.matches(axis, dimname.clone().into()) { + dbg!(&dim_condition, "matched", &dimname); edges[axis] = uniform_manifest_split_edges( num_chunks[axis], split_size, From a306abfdfe531d584f0cb9dffa88e0f377f864a9 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 16 Jun 2025 15:19:28 -0600 Subject: [PATCH 34/43] lint --- icechunk/src/repository.rs | 5 +++-- icechunk/src/session.rs | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 62997aa94..316acaf34 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1354,7 +1354,8 @@ mod tests { .await .unwrap() .unwrap(); - let expected = Bytes::copy_from_slice(format!("{0}", idx + offset).as_bytes()); + let expected = + Bytes::copy_from_slice(format!("{0}", idx + offset).as_bytes()); assert_eq!(actual, expected); } }; @@ -1408,7 +1409,7 @@ mod tests { .set_chunk_ref( array_path.clone(), ChunkIndices(vec![i, 0, 0]), - Some(ChunkPayload::Inline(format!("{0}", i+10).into())), + Some(ChunkPayload::Inline(format!("{0}", i + 10).into())), ) .await? } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 57952ec91..77d5b372a 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1846,7 +1846,6 @@ impl ManifestSplittingConfig { } in dim_specs.iter() { if dim_condition.matches(axis, dimname.clone().into()) { - dbg!(&dim_condition, "matched", &dimname); edges[axis] = uniform_manifest_split_edges( num_chunks[axis], split_size, From e719c7f43ee1dcac36e4bf33a796ba83fdb3b639 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Jun 2025 13:51:58 -0600 Subject: [PATCH 35/43] Review comments --- icechunk/src/change_set.rs | 9 ++-- icechunk/src/format/manifest.rs | 3 ++ icechunk/src/repository.rs | 6 +++ icechunk/src/session.rs | 82 ++++++++++++++++++--------------- icechunk/tests/test_gc.rs | 3 ++ 5 files changed, 62 insertions(+), 41 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 1e0a13ca8..25eb8e513 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -169,7 +169,7 @@ impl ChangeSet { for coord in deletes.iter() { if let Some(extents) = new_splits.find(coord) { new_manifests - .entry(extents) + .entry(extents.clone()) .or_default() .insert(coord.clone(), None); to_remove.insert(coord.clone()); @@ -252,7 +252,7 @@ impl ChangeSet { BTreeMap>, >::with_capacity(splits.len()) }) - .entry(extent) + .entry(extent.clone()) .or_default() .insert(coord, data); } @@ -263,8 +263,9 @@ impl ChangeSet { coords: &ChunkIndices, ) -> Option<&Option> { self.set_chunks.get(node_id).and_then(|node_chunks| { - find_coord(node_chunks.keys(), coords) - .and_then(|extent| node_chunks.get(&extent).and_then(|s| s.get(coords))) + find_coord(node_chunks.keys(), coords).and_then(|(_, extent)| { + node_chunks.get(extent).and_then(|s| s.get(coords)) + }) }) } diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index eea131b77..d01efae11 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -166,6 +166,9 @@ impl ManifestSplits { } pub fn compatible_with(&self, other: &Self) -> bool { + // this is not a simple zip + all(equals) because + // ordering might differ though both sets of splits + // must be complete. for ours in self.iter() { if any(other.iter(), |theirs| { ours.overlap_with(theirs) == Overlap::Partial diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 316acaf34..9f86530f6 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1924,6 +1924,7 @@ mod tests { vec![vec![0, 0, 1, 0], vec![0, 0, 0, 0], vec![0, 2, 0, 0], vec![0, 2, 0, 1]]; let mut session1 = repository.writable_session("main").await?; + let node_id = session1.get_node(&temp_path).await?.id; session1 .set_chunk_ref( temp_path.clone(), @@ -1992,7 +1993,12 @@ mod tests { ) .await?; + // Session.splits should be _complete_ so it should be identical for the same node + // on any two sessions with compatible splits + let splits = session1.lookup_splits(&node_id).unwrap().clone(); + assert_eq!(session1.lookup_splits(&node_id), session2.lookup_splits(&node_id)); session1.merge(session2).await?; + assert_eq!(session1.lookup_splits(&node_id), Some(&splits)); for (val, idx) in enumerate(indices.iter()) { let actual = get_chunk( session1 diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 77d5b372a..0576b96da 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -109,10 +109,15 @@ pub enum SessionErrorKind { InvalidIndex { coords: ChunkIndices, path: Path }, #[error("invalid chunk index for splitting manifests: {coords:?}")] InvalidIndexForSplitManifests { coords: ChunkIndices }, - #[error("incompatible manifest splitting config when merging sessions")] - IncompatibleSplits, + #[error("incompatible manifest splitting config when merging two sessions")] + IncompatibleSplittingConfig { + ours: ManifestSplittingConfig, + theirs: ManifestSplittingConfig, + }, #[error("`to` snapshot ancestry doesn't include `from`")] BadSnapshotChainForDiff, + #[error("failed to create manifest from chunk stream")] + ManifestCreationError(#[from] Box), } pub type SessionError = ICError; @@ -168,7 +173,10 @@ pub type SessionResult = Result; // and at read time to choose which manifest to query for chunk payload /// It is useful to have this act on an iterator (e.g. get_chunk_ref) /// The find method on ManifestSplits is simply a helper. -pub fn find_coord<'a, I>(mut iter: I, coord: &'a ChunkIndices) -> Option +pub fn find_coord<'a, I>( + iter: I, + coord: &'a ChunkIndices, +) -> Option<(usize, &'a ManifestExtents)> where I: Iterator, { @@ -182,25 +190,18 @@ where // ndim must be the same // Note: I don't think we can distinguish between out of bounds index for the array // and an index that is part of a split that hasn't been written yet. - iter.find(|e| e.contains(coord.0.as_slice())).cloned() -} - -pub fn position_coord<'a, I>(iter: I, coord: &'a ChunkIndices) -> Option -where - I: Iterator, -{ - enumerate(iter).find(|(_, e)| e.contains(coord.0.as_slice())).map(|x| x.0) + enumerate(iter).find(|(_, e)| e.contains(coord.0.as_slice())) } impl ManifestSplits { - pub fn find(&self, coord: &ChunkIndices) -> Option { + pub fn find<'a>(&'a self, coord: &'a ChunkIndices) -> Option<&'a ManifestExtents> { debug_assert_eq!(coord.0.len(), self.0[0].len()); - find_coord(self.iter(), coord) + find_coord(self.iter(), coord).map(|x| x.1) } pub fn position(&self, coord: &ChunkIndices) -> Option { debug_assert_eq!(coord.0.len(), self.0[0].len()); - position_coord(self.iter(), coord) + find_coord(self.iter(), coord).map(|x| x.0) } } @@ -238,6 +239,8 @@ impl Session { snapshot_id, change_set: ChangeSet::default(), default_commit_metadata: SnapshotProperties::default(), + // Splits are populated for a node during + // `add_array`, `update_array`, and `set_chunk_ref` splits: Default::default(), } } @@ -504,6 +507,8 @@ impl Session { self.splits.get(node_id) } + /// This method is directly called in add_array & update_array + /// where we know we must update the splits HashMap fn cache_splits( &mut self, node_id: &NodeId, @@ -526,11 +531,13 @@ impl Session { shape: &ArrayShape, dimension_names: &Option>, ) -> &ManifestSplits { - if !self.splits.contains_key(node_id) { - self.cache_splits(node_id, path, shape, dimension_names); - } - #[allow(clippy::expect_used)] - self.splits.get(node_id).expect("splits for node should always exist.") + self.splits.entry(node_id.clone()).or_insert_with(|| { + self.config.manifest().splitting().get_split_sizes( + path, + shape, + dimension_names, + ) + }) } // Helper function that accepts a NodeSnapshot instead of a path, @@ -810,8 +817,8 @@ impl Session { return Ok(None); } - let index = match position_coord(manifests.iter().map(|m| &m.extents), coords) { - Some(index) => index, + let index = match find_coord(manifests.iter().map(|m| &m.extents), coords) { + Some((index, _)) => index, // for an invalid coordinate, we bail. // This happens for two cases: // (1) the "coords" is out-of-range for the array shape @@ -909,17 +916,24 @@ impl Session { if self.read_only() { return Err(SessionErrorKind::ReadOnlySession.into()); } - let Session { splits, change_set, .. } = other; + let Session { splits: other_splits, change_set, .. } = other; if self.splits.iter().any(|(node, our_splits)| { - splits + other_splits .get(node) .is_some_and(|their_splits| !our_splits.compatible_with(their_splits)) }) { - return Err(SessionErrorKind::IncompatibleSplits.into()); + let ours = self.config().manifest().splitting().clone(); + let theirs = self.config().manifest().splitting().clone(); + return Err( + SessionErrorKind::IncompatibleSplittingConfig { ours, theirs }.into() + ); } - self.splits.extend(splits); + // Session.splits is _complete_ in that it will include every possible split. + // So a simple `extend` is fine, if the same node appears in two sessions, + // it must have the same splits and overwriting is fine. + self.splits.extend(other_splits); self.change_set.merge(change_set); Ok(()) } @@ -1331,7 +1345,6 @@ async fn verified_node_chunk_iterator<'a>( payload, }); - // FIXME: I don't understand this let old_chunks = change_set .update_existing_chunks( node_id_c3, old_chunks, @@ -1614,10 +1627,9 @@ impl<'a> FlushProcess<'a> { let mut to = vec![]; let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord); - #[allow(clippy::expect_used)] if let Some(new_manifest) = Manifest::from_stream(chunks) .await - .expect("failed to create manifest from chunk stream") + .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))? { let new_manifest = Arc::new(new_manifest); let new_manifest_size = @@ -1648,7 +1660,6 @@ impl<'a> FlushProcess<'a> { let splits = self.splits.get(node_id).expect("getting split for node unexpectedly failed"); - // TODO: this could be try_fold with the refs HashMap as state for extent in splits.iter() { if self.change_set.array_manifest(node_id, extent).is_some() { let chunks = stream::iter( @@ -1676,7 +1687,7 @@ impl<'a> FlushProcess<'a> { async fn write_manifest_for_existing_node( &mut self, node: &NodeSnapshot, - manifests: Vec, + existing_manifests: Vec, old_snapshot: &Snapshot, ) -> SessionResult<()> { #[allow(clippy::expect_used)] @@ -1686,18 +1697,17 @@ impl<'a> FlushProcess<'a> { HashMap::>::with_capacity(splits.len()); let on_disk_extents = - manifests.iter().map(|m| m.extents.clone()).collect::>(); + existing_manifests.iter().map(|m| m.extents.clone()).collect::>(); let modified_splits = self .change_set .modified_manifest_extents_iterator(&node.id, &node.path) .collect::>(); - // FIXME: there is an invariant here // ``modified_splits`` (i.e. splits used in this session) // must be a subset of ``splits`` (the splits set in the config) + debug_assert!(modified_splits.is_subset(&splits.iter().collect::>())); - // TODO: this should be try_fold with the refs HashMap as state for extent in splits.iter() { if modified_splits.contains(extent) { // this split was modified in this session, rewrite it completely @@ -1705,6 +1715,7 @@ impl<'a> FlushProcess<'a> { .await? .map(|new_ref| refs.insert(extent.clone(), vec![new_ref])); } else { + // intersection of the current split with extents on disk let on_disk_bbox = on_disk_extents .iter() .filter_map(|e| e.intersection(extent)) @@ -1712,7 +1723,7 @@ impl<'a> FlushProcess<'a> { // split was unmodified in this session. Let's look at the current manifests // and see what we need to do with them - for old_ref in manifests.iter() { + for old_ref in existing_manifests.iter() { // Remember that the extents written to disk are the `from`:`to` ranges // of populated chunks match old_ref.extents.overlap_with(extent) { @@ -3330,10 +3341,8 @@ mod tests { ds.add_array(a2path.clone(), shape.clone(), dimension_names.clone(), def.clone()) .await?; - dbg!("added arrays, now commit"); let _ = ds.commit("first commit", None).await?; - dbg!("committed arrays"); // there should be no manifests yet because we didn't add any chunks assert_eq!( 0, @@ -3358,7 +3367,6 @@ mod tests { let mut ds = repo.writable_session("main").await?; - dbg!("setting chunk ref"); // add 3 chunks ds.set_chunk_ref( a1path.clone(), diff --git a/icechunk/tests/test_gc.rs b/icechunk/tests/test_gc.rs index 82da26a64..ac5d53535 100644 --- a/icechunk/tests/test_gc.rs +++ b/icechunk/tests/test_gc.rs @@ -67,6 +67,7 @@ pub async fn do_test_gc( let storage_settings = storage.default_settings(); let shape = ArrayShape::new(vec![(1100, 1)]).unwrap(); + // intentionally small to create garbage let manifest_split_size = 10; let split_sizes = Some(vec![( ManifestSplitCondition::PathMatches { regex: r".*".to_string() }, @@ -113,6 +114,7 @@ pub async fn do_test_gc( let mut ds = repo.writable_session("main").await?; // overwrite 10 chunks + // This will only overwrite one split manifest. for idx in 0..10 { let bytes = Bytes::copy_from_slice(&0i8.to_be_bytes()); let payload = ds.get_chunk_writer()(bytes.clone()).await?; @@ -167,6 +169,7 @@ pub async fn do_test_gc( ) .await?; assert_eq!(summary.chunks_deleted, 10); + // only one manifest was re-created, so there is only one garbage manifest assert_eq!(summary.manifests_deleted, 1); assert_eq!(summary.snapshots_deleted, 1); assert!(summary.bytes_deleted > summary.chunks_deleted); From b7fb3855d81091c38c3bd80033c807f5801cefc3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Jun 2025 14:18:59 -0600 Subject: [PATCH 36/43] Use ManifestExtents::ALL sentinel --- icechunk/src/change_set.rs | 10 ++++------ icechunk/src/format/manifest.rs | 22 ++++++++++++++++++++- icechunk/src/session.rs | 35 +++++++++++++-------------------- 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 25eb8e513..444c031b9 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -297,7 +297,7 @@ impl ChangeSet { &self, node_id: &NodeId, node_path: &Path, - extent: Option, + extent: ManifestExtents, ) -> impl Iterator)> + use<'_> { if self.is_deleted(node_path, node_id) { return Either::Left(iter::empty()); @@ -306,9 +306,7 @@ impl ChangeSet { None => Either::Left(iter::empty()), Some(h) => Either::Right( h.iter() - .filter(move |(manifest_extent, _)| { - extent.as_ref().is_none_or(|e| e == *manifest_extent) - }) + .filter(move |(manifest_extent, _)| extent.matches(manifest_extent)) .flat_map(|(_, manifest)| manifest.iter()), ), } @@ -318,7 +316,7 @@ impl ChangeSet { &self, ) -> impl Iterator + use<'_> { self.new_arrays.iter().flat_map(|(path, (node_id, _))| { - self.new_array_chunk_iterator(node_id, path, None) + self.new_array_chunk_iterator(node_id, path, ManifestExtents::ALL) .map(|ci| (path.clone(), ci)) }) } @@ -327,7 +325,7 @@ impl ChangeSet { &'a self, node_id: &'a NodeId, node_path: &Path, - extent: Option, + extent: ManifestExtents, ) -> impl Iterator + use<'a> { self.array_chunks_iterator(node_id, node_path, extent).filter_map( move |(coords, payload)| { diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index d01efae11..37fe3d400 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -36,6 +36,9 @@ pub enum Overlap { pub struct ManifestExtents(Vec>); impl ManifestExtents { + // sentinel for a "universal set" + pub const ALL: Self = Self(Vec::new()); + pub fn new(from: &[u32], to: &[u32]) -> Self { let v = from .iter() @@ -66,6 +69,10 @@ impl ManifestExtents { } pub fn intersection(&self, other: &Self) -> Option { + if self == &Self::ALL { + return Some(other.clone()); + } + debug_assert_eq!(self.len(), other.len()); let ranges = zip(self.iter(), other.iter()) .map(|(a, b)| max(a.start, b.start)..min(a.end, b.end)) @@ -74,6 +81,9 @@ impl ManifestExtents { } pub fn union(&self, other: &Self) -> Self { + if self == &Self::ALL { + return Self::ALL; + } debug_assert_eq!(self.len(), other.len()); Self::from_ranges_iter( zip(self.iter(), other.iter()) @@ -83,13 +93,17 @@ impl ManifestExtents { pub fn overlap_with(&self, other: &Self) -> Overlap { // Important: this is not symmetric. + if *other == Self::ALL { + return Overlap::Complete; + } else if *self == Self::ALL { + return Overlap::Partial; + } debug_assert!( self.len() == other.len(), "Length mismatch: self = {:?}, other = {:?}", &self, &other ); - let mut overlap = Overlap::Complete; for (a, b) in zip(other.iter(), self.iter()) { debug_assert!(a.start <= a.end, "Invalid range: {:?}", a.clone()); @@ -102,6 +116,12 @@ impl ManifestExtents { } overlap } + + pub fn matches(&self, other: &ManifestExtents) -> bool { + // used in `.filter` + // ALL always matches any other extents + if *self == Self::ALL { true } else { self == other } + } } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 0576b96da..8690bb404 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -628,7 +628,7 @@ impl Session { &self.change_set, &self.snapshot_id, path, - None, + ManifestExtents::ALL, ) .await } @@ -870,7 +870,7 @@ impl Session { &self.change_set, &self.snapshot_id, node.clone(), - None, + ManifestExtents::ALL, ) .await .map_ok(|(_path, chunk_info)| chunk_info.coord); @@ -878,7 +878,7 @@ impl Session { let res = try_stream! { let new_chunks = stream::iter( self.change_set - .new_array_chunk_iterator(&node.id, array_path, None) + .new_array_chunk_iterator(&node.id, array_path, ManifestExtents::ALL) .map(|chunk_info| Ok::(chunk_info.coord)), ); @@ -1210,7 +1210,7 @@ async fn updated_chunk_iterator<'a>( change_set, snapshot_id, node, - None, + ManifestExtents::ALL, ) .await) }); @@ -1222,7 +1222,7 @@ async fn updated_node_chunks_iterator<'a>( change_set: &'a ChangeSet, snapshot_id: &'a SnapshotId, node: NodeSnapshot, - extent: Option, + extent: ManifestExtents, ) -> impl Stream> + 'a { // This iterator should yield chunks for existing arrays + any updates. // we check for deletion here in the case that `path` exists in the snapshot, @@ -1252,7 +1252,7 @@ async fn node_chunk_iterator<'a>( change_set: &'a ChangeSet, snapshot_id: &'a SnapshotId, path: &Path, - extent: Option, + extent: ManifestExtents, ) -> impl Stream> + 'a + use<'a> { match get_node(asset_manager, change_set, snapshot_id, path).await { Ok(node) => futures::future::Either::Left( @@ -1275,7 +1275,7 @@ async fn verified_node_chunk_iterator<'a>( snapshot_id: &'a SnapshotId, change_set: &'a ChangeSet, node: NodeSnapshot, - extent: Option, + extent: ManifestExtents, ) -> impl Stream> + 'a { match node.node_data { NodeData::Group => futures::future::Either::Left(futures::stream::empty()), @@ -1308,9 +1308,10 @@ async fn verified_node_chunk_iterator<'a>( futures::stream::iter(new_chunks).chain( futures::stream::iter(manifests) .filter(move |manifest_ref| { - futures::future::ready(extent.as_ref().is_none_or(|e| { - e.overlap_with(&manifest_ref.extents) != Overlap::None - })) + futures::future::ready( + extent.overlap_with(&manifest_ref.extents) + != Overlap::None, + ) }) .then(move |manifest_ref| { let new_chunk_indices = new_chunk_indices.clone(); @@ -1333,11 +1334,7 @@ async fn verified_node_chunk_iterator<'a>( !new_chunk_indices.contains(coord) // If the manifest we are parsing partially overlaps with `extent`, // we need to filter all coordinates - && extent_c2.as_ref().is_none_or( - move |e| { - e.contains(coord.0.as_slice()) - }, - ) + && extent_c2.contains(coord.0.as_slice()) }) .map_ok(move |(coord, payload)| ChunkInfo { node: node_id_c2.clone(), @@ -1612,7 +1609,7 @@ impl<'a> FlushProcess<'a> { self.change_set, self.parent_id, node.clone(), - Some(extent.clone()), + extent.clone(), ) .await .map_ok(|(_path, chunk_info)| chunk_info); @@ -1664,11 +1661,7 @@ impl<'a> FlushProcess<'a> { if self.change_set.array_manifest(node_id, extent).is_some() { let chunks = stream::iter( self.change_set - .new_array_chunk_iterator( - node_id, - node_path, - Some(extent.clone()), - ) + .new_array_chunk_iterator(node_id, node_path, extent.clone()) .map(Ok), ); #[allow(clippy::expect_used)] From 4a2e455f80d9b0c0fc2d8fc7e159b7b6bddbe8c2 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 18 Jun 2025 16:06:32 -0600 Subject: [PATCH 37/43] Update stateful tests --- .../tests/test_zarr/test_stateful.py | 178 +++++++++++++++--- icechunk/src/repository.rs | 40 ++++ icechunk/src/session.rs | 7 +- 3 files changed, 199 insertions(+), 26 deletions(-) diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py index 17e1c7f85..f94938f85 100644 --- a/icechunk-python/tests/test_zarr/test_stateful.py +++ b/icechunk-python/tests/test_zarr/test_stateful.py @@ -1,4 +1,5 @@ import json +from collections.abc import Iterable from typing import Any import hypothesis.extra.numpy as npst @@ -14,8 +15,9 @@ run_state_machine_as_test, ) +import icechunk as ic import zarr -from icechunk import Repository, in_memory_storage +from icechunk import Repository, Storage, in_memory_storage from zarr.core.buffer import default_buffer_prototype from zarr.testing.stateful import ZarrHierarchyStateMachine from zarr.testing.strategies import ( @@ -23,10 +25,64 @@ node_names, np_array_and_chunks, numpy_arrays, + orthogonal_indices, ) PROTOTYPE = default_buffer_prototype() +pytestmark = [ + pytest.mark.filterwarnings( + "ignore::zarr.core.dtype.common.UnstableSpecificationWarning" + ), +] + + +import functools + + +def with_frequency(frequency): + """ + Decorator to control how frequently a rule runs in Hypothesis stateful tests. + + Args: + frequency: Float between 0 and 1, where 1.0 means always run, + 0.1 means run ~10% of the time, etc. + + Usage: + @rule() + @with_frequency(0.1) # Run ~10% of the time + def rare_operation(self): + pass + """ + + def decorator(func): + # Create a counter attribute name specific to this function + counter_attr = f"__{func.__name__}_counter" + + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + return func(self, *args, **kwargs) + + # Add precondition that checks frequency + @precondition + def frequency_check(self): + # Initialize counter if it doesn't exist + if not hasattr(self, counter_attr): + setattr(self, counter_attr, 0) + + # Increment counter + current_count = getattr(self, counter_attr) + 1 + setattr(self, counter_attr, current_count) + + # Check if we should run based on frequency + # This gives roughly the right frequency over many calls + return (current_count * frequency) % 1.0 >= (1.0 - frequency) + + # Apply the precondition to the wrapped function + return frequency_check(wrapper) + + return decorator + @st.composite def chunk_paths( @@ -41,14 +97,66 @@ def chunk_paths( return "/".join(map(str, blockidx[subset_slicer])) +@st.composite +def splitting_configs( + draw: st.DrawFn, *, arrays: Iterable[zarr.Array] +) -> ic.ManifestSplittingConfig: + config_dict = {} + for array in arrays: + if draw(st.booleans()): + array_condition = ic.ManifestSplitCondition.name_matches( + array.path.split("/")[-1] + ) + else: + array_condition = ic.ManifestSplitCondition.path_matches(array.path) + dimnames = array.metadata.dimension_names or (None,) * array.ndim + dimsize_axis_names = draw( + st.lists( + st.sampled_from( + tuple(zip(array.shape, range(array.ndim), dimnames, strict=False)) + ), + min_size=1, + unique=True, + ) + ) + for size, axis, dimname in dimsize_axis_names: + if dimname is None or draw(st.booleans()): + key = ic.ManifestSplitDimCondition.Axis(axis) + else: + key = ic.ManifestSplitDimCondition.DimensionName(dimname) + config_dict[array_condition] = { + key: draw(st.integers(min_value=1, max_value=size + 10)) + } + return ic.ManifestSplittingConfig.from_dict(config_dict) + + # TODO: more before/after commit invariants? # TODO: add "/" to self.all_groups, deleting "/" seems to be problematic class ModifiedZarrHierarchyStateMachine(ZarrHierarchyStateMachine): - def __init__(self, repo: Repository) -> None: - self.repo = repo - store = repo.writable_session("main").store + def __init__(self, storage: Storage) -> None: + self.storage = storage + self.repo = Repository.create(self.storage) + store = self.repo.writable_session("main").store super().__init__(store) + @precondition( + lambda self: not self.store.session.has_uncommitted_changes + and bool(self.all_arrays) + ) + @rule(data=st.data()) + def reopen_with_config(self, data): + array_paths = data.draw( + st.lists(st.sampled_from(sorted(self.all_arrays)), max_size=3, unique=True) + ) + arrays = tuple(zarr.open_array(self.model, path=path) for path in array_paths) + sconfig = data.draw(splitting_configs(arrays=arrays)) + config = ic.RepositoryConfig( + inline_chunk_threshold_bytes=0, manifest=ic.ManifestConfig(splitting=sconfig) + ) + note(f"reopening with splitting config {sconfig=!r}") + self.repo = Repository.open(self.storage, config=config) + self.store = self.repo.writable_session("main").store + @precondition(lambda self: self.store.session.has_uncommitted_changes) @rule(data=st.data()) def commit_with_check(self, data) -> None: @@ -110,8 +218,49 @@ def add_array( assume(array.size > 0) super().add_array(data, name, array_and_chunks) + @precondition(lambda self: bool(self.all_groups)) + @rule(data=st.data()) + def check_list_dir(self, data: st.DataObject) -> None: + path = self.draw_directory(data) + note(f"list_dir for {path=!r}") + model_ls = sorted(self._sync_iter(self.model.list_dir(path))) + store_ls = sorted(self._sync_iter(self.store.list_dir(path))) + if model_ls != store_ls and set(model_ls).symmetric_difference(set(store_ls)) != { + "c" + }: + # Consider .list_dir("path/to/array") for an array with a single chunk. + # The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists + # If that chunk was deleted, then `"c"` is not returned. + # LocalStore will not have this behaviour :/ + # In Icechunk, we always return the `c` so ignore this inconsistency. + assert model_ls == store_ls, (model_ls, store_ls) + ##### TODO: port everything below to zarr + @precondition(lambda self: bool(self.all_arrays)) + @rule(data=st.data()) + def check_array(self, data: st.DataObject) -> None: + path = data.draw(st.sampled_from(sorted(self.all_arrays))) + actual = zarr.open_array(self.store, path=path)[:] + expected = zarr.open_array(self.model, path=path)[:] + np.testing.assert_equal(actual, expected) + + @precondition(lambda self: bool(self.all_arrays)) + @rule(data=st.data()) + def overwrite_array_orthogonal_indexing(self, data: st.DataObject) -> None: + array = data.draw(st.sampled_from(sorted(self.all_arrays))) + model_array = zarr.open_array(path=array, store=self.model) + store_array = zarr.open_array(path=array, store=self.store) + indexer, _ = data.draw(orthogonal_indices(shape=model_array.shape)) + note(f"overwriting array orthogonal {indexer=}") + new_data = data.draw( + npst.arrays(shape=model_array.oindex[indexer].shape, dtype=model_array.dtype) + ) + model_array.oindex[indexer] = new_data + store_array.oindex[indexer] = new_data + + ##### TODO: delete after next Zarr release (Jun 18, 2025) @rule() + @with_frequency(0.25) def clear(self) -> None: note("clearing") import zarr @@ -154,23 +303,6 @@ def draw_directory(self, data) -> str: path = array_or_group return path - @precondition(lambda self: bool(self.all_groups)) - @rule(data=st.data()) - def check_list_dir(self, data) -> None: - path = self.draw_directory(data) - note(f"list_dir for {path=!r}") - model_ls = sorted(self._sync_iter(self.model.list_dir(path))) - store_ls = sorted(self._sync_iter(self.store.list_dir(path))) - if model_ls != store_ls and set(model_ls).symmetric_difference(set(store_ls)) != { - "c" - }: - # Consider .list_dir("path/to/array") for an array with a single chunk. - # The MemoryStore model will return `"c", "zarr.json"` only if the chunk exists - # If that chunk was deleted, then `"c"` is not returned. - # LocalStore will not have this behaviour :/ - # In Icechunk, we always return the `c` so ignore this inconsistency. - assert model_ls == store_ls, (model_ls, store_ls) - @precondition(lambda self: bool(self.all_arrays)) @rule(data=st.data()) def delete_chunk(self, data) -> None: @@ -247,10 +379,8 @@ def check_list_prefix_from_root(self) -> None: def test_zarr_hierarchy() -> None: - repo = Repository.create(in_memory_storage()) - def mk_test_instance_sync() -> ModifiedZarrHierarchyStateMachine: - return ModifiedZarrHierarchyStateMachine(repo) + return ModifiedZarrHierarchyStateMachine(in_memory_storage()) run_state_machine_as_test( mk_test_instance_sync, settings=Settings(report_multiple_bugs=False) diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 9f86530f6..919c0848d 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1578,6 +1578,46 @@ mod tests { assert_eq!(val, Bytes::copy_from_slice(format!("{0}", i).as_bytes())); } + // delete all chunks + let mut session = repository.writable_session("main").await?; + for i in 0..dim_size { + session + .set_chunk_ref(temp_path.clone(), ChunkIndices(vec![i, 0, 0]), None) + .await?; + } + total_manifests += 0; + session.commit("clear existing array", None).await?; + assert_manifest_count(&storage, total_manifests).await; + + // add a new array + let def = Bytes::from_static(br#"{"this":"array"}"#); + let array_path: Path = "/array2".to_string().try_into().unwrap(); + let mut session = repository.writable_session("main").await?; + session + .add_array( + array_path.clone(), + shape.clone(), + dimension_names.clone(), + def.clone(), + ) + .await?; + // set a chunk + session + .set_chunk_ref( + array_path.clone(), + ChunkIndices(vec![1, 0, 0]), + Some(ChunkPayload::Inline(format!("{0}", 10).into())), + ) + .await?; + // delete that chunk, so the chunks iterator is empty + // regression test for bug found by hypothesis + session + .set_chunk_ref(array_path.clone(), ChunkIndices(vec![1, 0, 0]), None) + .await?; + total_manifests += 0; + session.commit("clear new array", None).await?; + assert_manifest_count(&storage, total_manifests).await; + Ok(()) } diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 8690bb404..1e0777b04 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -1665,10 +1665,13 @@ impl<'a> FlushProcess<'a> { .map(Ok), ); #[allow(clippy::expect_used)] - let new_ref = self.write_manifest_from_iterator(chunks).await?.expect( + let new_ref = self.write_manifest_from_iterator(chunks).await.expect( "logic bug. for a new node, we must always write the manifest", ); - self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref); + // new_ref is None if there were no chunks in the iterator + if let Some(new_ref) = new_ref { + self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref); + } } } Ok(()) From b179dbc390fdfc71bef5e5df8f0c4ed2d85402af Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 20 Jun 2025 14:29:30 -0600 Subject: [PATCH 38/43] update benchmarks dep group --- icechunk-python/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/icechunk-python/pyproject.toml b/icechunk-python/pyproject.toml index 478bf3a0e..ee2db15b6 100644 --- a/icechunk-python/pyproject.toml +++ b/icechunk-python/pyproject.toml @@ -51,6 +51,7 @@ benchmark = [ "humanize", "platformdirs", "ipdb", + "coiled", ] docs = [ "scipy", From e184e2998a54632db113fee87762ad2fff57df3f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 20 Jun 2025 16:14:28 -0600 Subject: [PATCH 39/43] small edits --- icechunk-python/benchmarks/conftest.py | 10 +++++----- icechunk-python/tests/test_zarr/test_stateful.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/icechunk-python/benchmarks/conftest.py b/icechunk-python/benchmarks/conftest.py index d45e2b300..2617b1c20 100644 --- a/icechunk-python/benchmarks/conftest.py +++ b/icechunk-python/benchmarks/conftest.py @@ -65,11 +65,11 @@ def large_write_dataset(request) -> BenchmarkWriteDataset: @pytest.fixture( params=[ - # pytest.param(GB_8MB_CHUNKS, id="gb-8mb"), - # pytest.param(GB_128MB_CHUNKS, id="gb-128mb"), - # pytest.param(ERA5_SINGLE, id="era5-single"), - # pytest.param(ERA5, id="era5-weatherbench"), - # pytest.param(ERA5_ARCO, id="era5-arco"), + pytest.param(GB_8MB_CHUNKS, id="gb-8mb"), + pytest.param(GB_128MB_CHUNKS, id="gb-128mb"), + pytest.param(ERA5_SINGLE, id="era5-single"), + pytest.param(ERA5, id="era5-weatherbench"), + pytest.param(ERA5_ARCO, id="era5-arco"), pytest.param(LARGE_MANIFEST_UNSHARDED, id="large-manifest-no-split"), pytest.param( LARGE_MANIFEST_SHARDED, diff --git a/icechunk-python/tests/test_zarr/test_stateful.py b/icechunk-python/tests/test_zarr/test_stateful.py index f94938f85..3ad392b69 100644 --- a/icechunk-python/tests/test_zarr/test_stateful.py +++ b/icechunk-python/tests/test_zarr/test_stateful.py @@ -30,11 +30,11 @@ PROTOTYPE = default_buffer_prototype() -pytestmark = [ - pytest.mark.filterwarnings( - "ignore::zarr.core.dtype.common.UnstableSpecificationWarning" - ), -] +# pytestmark = [ +# pytest.mark.filterwarnings( +# "ignore::zarr.core.dtype.common.UnstableSpecificationWarning" +# ), +# ] import functools From 3b522597aa9c4ceecc5ae8a3097ebf24064faa41 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 21 Jun 2025 17:35:58 -0600 Subject: [PATCH 40/43] Parallel writes --- icechunk/src/session.rs | 189 +++++++++++++++++++++++----------------- 1 file changed, 107 insertions(+), 82 deletions(-) diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 1e0777b04..a7e8650c5 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -12,7 +12,9 @@ use async_stream::try_stream; use bytes::Bytes; use chrono::{DateTime, Utc}; use err_into::ErrorInto; -use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::Either, stream}; +use futures::{ + FutureExt, Stream, StreamExt, TryStream, TryStreamExt, future::Either, stream, +}; use itertools::{Itertools as _, enumerate, repeat_n}; use regex::bytes::Regex; use serde::{Deserialize, Serialize}; @@ -1572,6 +1574,40 @@ pub fn construct_valid_byte_range( } } +async fn write_manifest_from_iterator( + asset_manager: &AssetManager, + chunks: impl Stream>, +) -> SessionResult> { + let mut from = vec![]; + let mut to = vec![]; + let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord); + + if let Some(new_manifest) = Manifest::from_stream(chunks) + .await + .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))? + { + let new_manifest = Arc::new(new_manifest); + let new_manifest_size = + asset_manager.write_manifest(Arc::clone(&new_manifest)).await?; + + let file_info = ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); + + let new_ref = ManifestRef { + object_id: new_manifest.id().clone(), + extents: ManifestExtents::new(&from, &to), + }; + Ok(Some((new_ref, file_info))) + } else { + Ok(None) + } +} + +fn map_ok_second( + stream: impl TryStream, Ok = (A, B), Error = E>, +) -> impl TryStream, Ok = B, Error = E> { + stream.map_ok(|(_, second)| second) +} + struct FlushProcess<'a> { asset_manager: Arc, change_set: &'a ChangeSet, @@ -1598,54 +1634,6 @@ impl<'a> FlushProcess<'a> { } } - async fn write_manifest_for_updated_chunks( - &mut self, - node: &NodeSnapshot, - extent: &ManifestExtents, - ) -> SessionResult> { - let asset_manager = Arc::clone(&self.asset_manager); - let updated_chunks = updated_node_chunks_iterator( - asset_manager.as_ref(), - self.change_set, - self.parent_id, - node.clone(), - extent.clone(), - ) - .await - .map_ok(|(_path, chunk_info)| chunk_info); - self.write_manifest_from_iterator(updated_chunks).await - } - - async fn write_manifest_from_iterator( - &mut self, - chunks: impl Stream>, - ) -> SessionResult> { - let mut from = vec![]; - let mut to = vec![]; - let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord); - - if let Some(new_manifest) = Manifest::from_stream(chunks) - .await - .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))? - { - let new_manifest = Arc::new(new_manifest); - let new_manifest_size = - self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?; - - let file_info = - ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); - self.manifest_files.insert(file_info); - - let new_ref = ManifestRef { - object_id: new_manifest.id().clone(), - extents: ManifestExtents::new(&from, &to), - }; - Ok(Some(new_ref)) - } else { - Ok(None) - } - } - /// Write a manifest for a node that was created in this session /// It doesn't need to look at previous manifests because the node is new async fn write_manifest_for_new_node( @@ -1657,23 +1645,38 @@ impl<'a> FlushProcess<'a> { let splits = self.splits.get(node_id).expect("getting split for node unexpectedly failed"); - for extent in splits.iter() { - if self.change_set.array_manifest(node_id, extent).is_some() { - let chunks = stream::iter( - self.change_set - .new_array_chunk_iterator(node_id, node_path, extent.clone()) - .map(Ok), - ); - #[allow(clippy::expect_used)] - let new_ref = self.write_manifest_from_iterator(chunks).await.expect( - "logic bug. for a new node, we must always write the manifest", - ); - // new_ref is None if there were no chunks in the iterator - if let Some(new_ref) = new_ref { - self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref); + let iterators = splits + .iter() + .filter_map(|extent| { + if self.change_set.array_manifest(node_id, extent).is_some() { + let chunks = stream::iter( + self.change_set + .new_array_chunk_iterator(node_id, node_path, extent.clone()) + .map(Ok), + ); + Some(chunks) + } else { + None } - } + }) + .collect::>(); + + let new_refs = + futures::future::join_all(iterators.into_iter().map(|chunks| async { + #[allow(clippy::expect_used)] + write_manifest_from_iterator(self.asset_manager.as_ref(), chunks) + .await + .expect( + "logic bug. for a new node, we must always write the manifest", + ) + })) + .await; + + for (new_ref, file_info) in new_refs.into_iter().flatten() { + self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref); + self.manifest_files.insert(file_info); } + Ok(()) } @@ -1689,9 +1692,6 @@ impl<'a> FlushProcess<'a> { #[allow(clippy::expect_used)] let splits = self.splits.get(&node.id).expect("splits should exist for this node."); - let mut refs = - HashMap::>::with_capacity(splits.len()); - let on_disk_extents = existing_manifests.iter().map(|m| m.extents.clone()).collect::>(); @@ -1704,12 +1704,23 @@ impl<'a> FlushProcess<'a> { // must be a subset of ``splits`` (the splits set in the config) debug_assert!(modified_splits.is_subset(&splits.iter().collect::>())); + let mut tasks = Vec::new(); for extent in splits.iter() { if modified_splits.contains(extent) { - // this split was modified in this session, rewrite it completely - self.write_manifest_for_updated_chunks(node, extent) - .await? - .map(|new_ref| refs.insert(extent.clone(), vec![new_ref])); + let updated_chunks = map_ok_second( + updated_node_chunks_iterator( + self.asset_manager.as_ref(), + self.change_set, + self.parent_id, + node.clone(), + extent.clone(), + ) + .await, + ); + tasks.push(write_manifest_from_iterator( + self.asset_manager.as_ref(), + updated_chunks, + )); } else { // intersection of the current split with extents on disk let on_disk_bbox = on_disk_extents @@ -1726,7 +1737,10 @@ impl<'a> FlushProcess<'a> { Overlap::Complete => { debug_assert!(on_disk_bbox.is_some()); // Just propagate this ref again, no rewriting necessary - refs.entry(extent.clone()).or_default().push(old_ref.clone()); + self.manifest_refs + .entry(node.id.clone()) + .or_default() + .push(old_ref.clone()); // OK to unwrap here since this manifest file must exist in the old snapshot #[allow(clippy::expect_used)] self.manifest_files.insert( @@ -1737,12 +1751,20 @@ impl<'a> FlushProcess<'a> { // the splits have changed, but no refs in this split have been written in this session // same as `if` block above debug_assert!(on_disk_bbox.is_some()); - if let Some(new_ref) = self - .write_manifest_for_updated_chunks(node, extent) - .await? - { - refs.entry(extent.clone()).or_default().push(new_ref); - } + let updated_chunks = map_ok_second( + updated_node_chunks_iterator( + self.asset_manager.as_ref(), + self.change_set, + self.parent_id, + node.clone(), + extent.clone(), + ) + .await, + ); + tasks.push(write_manifest_from_iterator( + self.asset_manager.as_ref(), + updated_chunks, + )); } Overlap::None => { // Nothing to do @@ -1752,12 +1774,15 @@ impl<'a> FlushProcess<'a> { } } + let new_refs = futures::future::join_all(tasks.into_iter()).await; + // FIXME: Assert that bboxes in refs don't overlap - self.manifest_refs - .entry(node.id.clone()) - .or_default() - .extend(refs.into_values().flatten()); + for (new_ref, file_info) in new_refs.into_iter().flatten().flatten() { + self.manifest_refs.entry(node.id.clone()).or_default().push(new_ref); + self.manifest_files.insert(file_info); + } + Ok(()) } From 4e6823197d63268f7447dd9f0743c72331411a36 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 21 Jun 2025 17:38:04 -0600 Subject: [PATCH 41/43] more benchmarks --- icechunk-python/benchmarks/conftest.py | 10 ++-- .../benchmarks/test_benchmark_writes.py | 50 ++++++++++++++++++- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/icechunk-python/benchmarks/conftest.py b/icechunk-python/benchmarks/conftest.py index 2617b1c20..d45e2b300 100644 --- a/icechunk-python/benchmarks/conftest.py +++ b/icechunk-python/benchmarks/conftest.py @@ -65,11 +65,11 @@ def large_write_dataset(request) -> BenchmarkWriteDataset: @pytest.fixture( params=[ - pytest.param(GB_8MB_CHUNKS, id="gb-8mb"), - pytest.param(GB_128MB_CHUNKS, id="gb-128mb"), - pytest.param(ERA5_SINGLE, id="era5-single"), - pytest.param(ERA5, id="era5-weatherbench"), - pytest.param(ERA5_ARCO, id="era5-arco"), + # pytest.param(GB_8MB_CHUNKS, id="gb-8mb"), + # pytest.param(GB_128MB_CHUNKS, id="gb-128mb"), + # pytest.param(ERA5_SINGLE, id="era5-single"), + # pytest.param(ERA5, id="era5-weatherbench"), + # pytest.param(ERA5_ARCO, id="era5-arco"), pytest.param(LARGE_MANIFEST_UNSHARDED, id="large-manifest-no-split"), pytest.param( LARGE_MANIFEST_SHARDED, diff --git a/icechunk-python/benchmarks/test_benchmark_writes.py b/icechunk-python/benchmarks/test_benchmark_writes.py index 8c82fa4c9..ba0046725 100644 --- a/icechunk-python/benchmarks/test_benchmark_writes.py +++ b/icechunk-python/benchmarks/test_benchmark_writes.py @@ -181,7 +181,7 @@ def write(): @pytest.mark.benchmark(group="refs-write") -def test_write_split_manifest_refs(benchmark, splitting, large_write_dataset) -> None: +def test_write_split_manifest_refs_full_rewrite(benchmark, splitting, large_write_dataset) -> None: dataset = large_write_dataset config = repo_config_with(splitting=splitting) assert config is not None @@ -219,3 +219,51 @@ def commit(session_from_setup): session_from_setup.commit("wrote refs") benchmark.pedantic(commit, setup=write_refs, iterations=1, rounds=10) + + +@pytest.mark.benchmark(group="refs-write") +def test_write_split_manifest_refs_append(benchmark, splitting, large_write_dataset) -> None: + dataset = large_write_dataset + config = repo_config_with(splitting=splitting) + assert config is not None + if hasattr(config.manifest, "splitting"): + assert config.manifest.splitting == splitting + repo = dataset.create(config=config) + session = repo.writable_session(branch="main") + store = session.store + group = zarr.open_group(store, zarr_format=3) + group.create_array( + "array", + shape=dataset.shape, + chunks=dataset.chunks, + dtype="int8", + fill_value=0, + compressors=None, + ) + session.commit("initialize") + + # yuck, but I'm abusing `rounds` to do a loop and time _only_ the commit. + global counter + counter = 0 + rounds = 10 + num_chunks = dataset.shape[0] // dataset.chunks[0] + batch_size = num_chunks // rounds + + def write_refs() -> Session: + global counter + session = repo.writable_session(branch="main") + chunks = [ + VirtualChunkSpec( + index=[i], location=f"s3://foo/bar/{i}.nc", offset=0, length=1 + ) + for i in range(counter * batch_size, counter * batch_size + batch_size) + ] + counter += 1 + session.store.set_virtual_refs("array", chunks) + # (args, kwargs) + return ((session,), {}) + + def commit(session_from_setup): + session_from_setup.commit("wrote refs") + + benchmark.pedantic(commit, setup=write_refs, iterations=1, rounds=rounds) From d0c42becf67817a575000b6db31eeac7fdbeec88 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 23 Jun 2025 11:47:20 -0600 Subject: [PATCH 42/43] Revert "Parallel writes" This reverts commit 3b522597aa9c4ceecc5ae8a3097ebf24064faa41. --- icechunk/src/session.rs | 189 +++++++++++++++++----------------------- 1 file changed, 82 insertions(+), 107 deletions(-) diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index a7e8650c5..1e0777b04 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -12,9 +12,7 @@ use async_stream::try_stream; use bytes::Bytes; use chrono::{DateTime, Utc}; use err_into::ErrorInto; -use futures::{ - FutureExt, Stream, StreamExt, TryStream, TryStreamExt, future::Either, stream, -}; +use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::Either, stream}; use itertools::{Itertools as _, enumerate, repeat_n}; use regex::bytes::Regex; use serde::{Deserialize, Serialize}; @@ -1574,40 +1572,6 @@ pub fn construct_valid_byte_range( } } -async fn write_manifest_from_iterator( - asset_manager: &AssetManager, - chunks: impl Stream>, -) -> SessionResult> { - let mut from = vec![]; - let mut to = vec![]; - let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord); - - if let Some(new_manifest) = Manifest::from_stream(chunks) - .await - .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))? - { - let new_manifest = Arc::new(new_manifest); - let new_manifest_size = - asset_manager.write_manifest(Arc::clone(&new_manifest)).await?; - - let file_info = ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); - - let new_ref = ManifestRef { - object_id: new_manifest.id().clone(), - extents: ManifestExtents::new(&from, &to), - }; - Ok(Some((new_ref, file_info))) - } else { - Ok(None) - } -} - -fn map_ok_second( - stream: impl TryStream, Ok = (A, B), Error = E>, -) -> impl TryStream, Ok = B, Error = E> { - stream.map_ok(|(_, second)| second) -} - struct FlushProcess<'a> { asset_manager: Arc, change_set: &'a ChangeSet, @@ -1634,6 +1598,54 @@ impl<'a> FlushProcess<'a> { } } + async fn write_manifest_for_updated_chunks( + &mut self, + node: &NodeSnapshot, + extent: &ManifestExtents, + ) -> SessionResult> { + let asset_manager = Arc::clone(&self.asset_manager); + let updated_chunks = updated_node_chunks_iterator( + asset_manager.as_ref(), + self.change_set, + self.parent_id, + node.clone(), + extent.clone(), + ) + .await + .map_ok(|(_path, chunk_info)| chunk_info); + self.write_manifest_from_iterator(updated_chunks).await + } + + async fn write_manifest_from_iterator( + &mut self, + chunks: impl Stream>, + ) -> SessionResult> { + let mut from = vec![]; + let mut to = vec![]; + let chunks = aggregate_extents(&mut from, &mut to, chunks, |ci| &ci.coord); + + if let Some(new_manifest) = Manifest::from_stream(chunks) + .await + .map_err(|e| SessionErrorKind::ManifestCreationError(Box::new(e)))? + { + let new_manifest = Arc::new(new_manifest); + let new_manifest_size = + self.asset_manager.write_manifest(Arc::clone(&new_manifest)).await?; + + let file_info = + ManifestFileInfo::new(new_manifest.as_ref(), new_manifest_size); + self.manifest_files.insert(file_info); + + let new_ref = ManifestRef { + object_id: new_manifest.id().clone(), + extents: ManifestExtents::new(&from, &to), + }; + Ok(Some(new_ref)) + } else { + Ok(None) + } + } + /// Write a manifest for a node that was created in this session /// It doesn't need to look at previous manifests because the node is new async fn write_manifest_for_new_node( @@ -1645,38 +1657,23 @@ impl<'a> FlushProcess<'a> { let splits = self.splits.get(node_id).expect("getting split for node unexpectedly failed"); - let iterators = splits - .iter() - .filter_map(|extent| { - if self.change_set.array_manifest(node_id, extent).is_some() { - let chunks = stream::iter( - self.change_set - .new_array_chunk_iterator(node_id, node_path, extent.clone()) - .map(Ok), - ); - Some(chunks) - } else { - None - } - }) - .collect::>(); - - let new_refs = - futures::future::join_all(iterators.into_iter().map(|chunks| async { + for extent in splits.iter() { + if self.change_set.array_manifest(node_id, extent).is_some() { + let chunks = stream::iter( + self.change_set + .new_array_chunk_iterator(node_id, node_path, extent.clone()) + .map(Ok), + ); #[allow(clippy::expect_used)] - write_manifest_from_iterator(self.asset_manager.as_ref(), chunks) - .await - .expect( - "logic bug. for a new node, we must always write the manifest", - ) - })) - .await; - - for (new_ref, file_info) in new_refs.into_iter().flatten() { - self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref); - self.manifest_files.insert(file_info); + let new_ref = self.write_manifest_from_iterator(chunks).await.expect( + "logic bug. for a new node, we must always write the manifest", + ); + // new_ref is None if there were no chunks in the iterator + if let Some(new_ref) = new_ref { + self.manifest_refs.entry(node_id.clone()).or_default().push(new_ref); + } + } } - Ok(()) } @@ -1692,6 +1689,9 @@ impl<'a> FlushProcess<'a> { #[allow(clippy::expect_used)] let splits = self.splits.get(&node.id).expect("splits should exist for this node."); + let mut refs = + HashMap::>::with_capacity(splits.len()); + let on_disk_extents = existing_manifests.iter().map(|m| m.extents.clone()).collect::>(); @@ -1704,23 +1704,12 @@ impl<'a> FlushProcess<'a> { // must be a subset of ``splits`` (the splits set in the config) debug_assert!(modified_splits.is_subset(&splits.iter().collect::>())); - let mut tasks = Vec::new(); for extent in splits.iter() { if modified_splits.contains(extent) { - let updated_chunks = map_ok_second( - updated_node_chunks_iterator( - self.asset_manager.as_ref(), - self.change_set, - self.parent_id, - node.clone(), - extent.clone(), - ) - .await, - ); - tasks.push(write_manifest_from_iterator( - self.asset_manager.as_ref(), - updated_chunks, - )); + // this split was modified in this session, rewrite it completely + self.write_manifest_for_updated_chunks(node, extent) + .await? + .map(|new_ref| refs.insert(extent.clone(), vec![new_ref])); } else { // intersection of the current split with extents on disk let on_disk_bbox = on_disk_extents @@ -1737,10 +1726,7 @@ impl<'a> FlushProcess<'a> { Overlap::Complete => { debug_assert!(on_disk_bbox.is_some()); // Just propagate this ref again, no rewriting necessary - self.manifest_refs - .entry(node.id.clone()) - .or_default() - .push(old_ref.clone()); + refs.entry(extent.clone()).or_default().push(old_ref.clone()); // OK to unwrap here since this manifest file must exist in the old snapshot #[allow(clippy::expect_used)] self.manifest_files.insert( @@ -1751,20 +1737,12 @@ impl<'a> FlushProcess<'a> { // the splits have changed, but no refs in this split have been written in this session // same as `if` block above debug_assert!(on_disk_bbox.is_some()); - let updated_chunks = map_ok_second( - updated_node_chunks_iterator( - self.asset_manager.as_ref(), - self.change_set, - self.parent_id, - node.clone(), - extent.clone(), - ) - .await, - ); - tasks.push(write_manifest_from_iterator( - self.asset_manager.as_ref(), - updated_chunks, - )); + if let Some(new_ref) = self + .write_manifest_for_updated_chunks(node, extent) + .await? + { + refs.entry(extent.clone()).or_default().push(new_ref); + } } Overlap::None => { // Nothing to do @@ -1774,15 +1752,12 @@ impl<'a> FlushProcess<'a> { } } - let new_refs = futures::future::join_all(tasks.into_iter()).await; - // FIXME: Assert that bboxes in refs don't overlap - for (new_ref, file_info) in new_refs.into_iter().flatten().flatten() { - self.manifest_refs.entry(node.id.clone()).or_default().push(new_ref); - self.manifest_files.insert(file_info); - } - + self.manifest_refs + .entry(node.id.clone()) + .or_default() + .extend(refs.into_values().flatten()); Ok(()) } From fcdc2d686834279fe587b40231d80b1891b98c43 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 23 Jun 2025 11:59:08 -0600 Subject: [PATCH 43/43] lint --- icechunk-python/benchmarks/test_benchmark_writes.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/icechunk-python/benchmarks/test_benchmark_writes.py b/icechunk-python/benchmarks/test_benchmark_writes.py index ba0046725..efe77e8cc 100644 --- a/icechunk-python/benchmarks/test_benchmark_writes.py +++ b/icechunk-python/benchmarks/test_benchmark_writes.py @@ -181,7 +181,9 @@ def write(): @pytest.mark.benchmark(group="refs-write") -def test_write_split_manifest_refs_full_rewrite(benchmark, splitting, large_write_dataset) -> None: +def test_write_split_manifest_refs_full_rewrite( + benchmark, splitting, large_write_dataset +) -> None: dataset = large_write_dataset config = repo_config_with(splitting=splitting) assert config is not None @@ -222,7 +224,9 @@ def commit(session_from_setup): @pytest.mark.benchmark(group="refs-write") -def test_write_split_manifest_refs_append(benchmark, splitting, large_write_dataset) -> None: +def test_write_split_manifest_refs_append( + benchmark, splitting, large_write_dataset +) -> None: dataset = large_write_dataset config = repo_config_with(splitting=splitting) assert config is not None