Review feedback

dcherian · dcherian · commit d414d4b408dd · 2025-06-26T13:45:46.000-06:00
diff --git a/docs/docs/icechunk-python/performance.md b/docs/docs/icechunk-python/performance.md
@@ -16,9 +16,9 @@ Coming Soon.
 ## Splitting manifests
 
 Icechunk stores chunk references in a chunk manifest file stored in `manifests/`.
-For very large arrays (millions of chunks), these files can get quite large.
 By default, Icechunk stores all chunk references in a single manifest file per array.
-Requesting even a single chunk requires downloading the entire manifest.
+For very large arrays (millions of chunks), these files can get quite large.
+Requesting even a single chunk will require downloading the entire manifest.
 In some cases, this can result in a slow time-to-first-byte or large memory usage.
 Similarly, appending a small amount of data to a large array requires
 downloading and rewriting the entire manifest.
@@ -47,7 +47,7 @@ repo_config = ic.RepositoryConfig(
 )
 ```
 
-Then pass the config to `Repository.open` or `Repository.create`
+Then pass the `config` to `Repository.open` or `Repository.create`
 ```python
 repo = ic.Repository.open(..., config=repo_config)
 ```
@@ -206,7 +206,13 @@ This ends up rewriting all refs to two new manifests.
 
 ### Rewriting manifests
 
-To force Icechunk to rewrite all chunk refs to the current splitting configuration use [`rewrite_manifests`](./reference.md#icechunk.Repository.rewrite_manifests) --- for the current example this will consolidate to two manifests.
+Remember, by default Icechunk only writes one manifest per array regardless of size.
+For large enough arrays, you might see a relative performance hit while committing a new update (e.g. an append),
+or when reading from a Repository object that was just created.
+At that point, you will want to experiment with different manifest split configurations.
+
+To force Icechunk to rewrite all chunk refs to the current splitting configuration use [`rewrite_manifests`](./reference.md#icechunk.Repository.rewrite_manifests)
+--- for the current example this will consolidate to two manifests.
 
 To illustrate, we will use a split size of 3.
 ```python exec="on" session="perf" source="material-block"
@@ -219,7 +225,7 @@ repo_config = ic.RepositoryConfig(
 new_repo = ic.Repository.open(storage, config=repo_config)
 
 snap4 = new_repo.rewrite_manifests(
-    f"rewrite_manifests with new config {str(split_config.to_dict())!r}", branch="main"
+    f"rewrite_manifests with new config", branch="main"
 )
 ```
 
@@ -228,6 +234,57 @@ snap4 = new_repo.rewrite_manifests(
 print(repo.lookup_snapshot(snap4).manifests)
 ```
 
+The splitting configuration is saved in the snapshot metadata.
+```python exec="on" session="perf" source="material-block"
+print(repo.lookup_snapshot(snap4).metadata)
+```
+
 !!! important
 
     Once you find a splitting configuration you like, remember to persist it on-disk using `repo.save_config`.
+
+
+### Example workflow
+
+Here is an example workflow for experimenting with splitting
+
+```python exec="on" session="perf" source="material-block"
+# first define a new config
+split_config = ManifestSplittingConfig.from_dict(
+    {ManifestSplitCondition.AnyArray(): {ManifestSplitDimCondition.Any(): 5}}
+)
+repo_config = ic.RepositoryConfig(
+    manifest=ic.ManifestConfig(splitting=split_config),
+)
+# open the repo with the new config.
+repo = ic.Repository.open(storage, config=repo_config)
+```
+
+We will rewrite the manifests on a different branch
+```python exec="on" session="perf" source="material-block"
+repo.create_branch("split-experiment-1")
+snap = repo.rewrite_manifests(
+    f"rewrite_manifests with new config", branch="split-experiment-1"
+)
+```
+Now benchmark reads on `main` vs `split-experiment-1`
+```python exec="on" session="perf" source="material-block"
+store = repo.readonly_session("main").store
+store_split = repo.readonly_session("split-experiment-1").store
+# ...
+```
+Assume we decided the configuration on `split-experiment-1` was good.
+First we persist that configuration to disk
+```python exec="on" session="perf" source="material-block"
+repo.save_config()
+```
+
+Now point main to the commit with rewritten manifests
+```python exec="on" session="perf" source="material-block"
+repo.reset_branch("main", repo.lookup_branch("split-experiment-1"))
+```
+
+Notice that the persisted config is restored when opening a Repository
+```python exec="on" session="perf" source="material-block"
+print(ic.Repository.open(storage).config.manifest)
+```
diff --git a/icechunk-python/python/icechunk/repository.py b/icechunk-python/python/icechunk/repository.py
@@ -638,7 +638,14 @@ def rewrite_manifests(
         self, message: str, *, branch: str, metadata: dict[str, Any] | None = None
     ) -> str:
         """
-        Rewrite manifests for all arrays and commit to the specified branch.
+        Rewrite manifests for all arrays.
+
+        This method will start a new writable session on the specified branch,
+        rewrite manifests for all arrays, and then commit with the specified ``message``
+        and ``metadata``.
+
+        A JSON representation of the currently active splitting configuration will be
+        stored in the commit's metadata under the key ``"splitting_config"``.
 
         Parameters
         ----------
diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs
@@ -1,7 +1,6 @@
 use std::{
     borrow::Cow,
     collections::{BTreeMap, BTreeSet, HashMap, HashSet},
-    ops::Deref,
     sync::Arc,
 };
 
@@ -941,7 +940,7 @@ impl PyRepository {
             let result =
                 pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
                     let lock = self.0.read().await;
-                    rewrite_manifests(lock.deref(), branch, message, metadata)
+                    rewrite_manifests(&lock, branch, message, metadata)
                         .await
                         .map_err(PyIcechunkStoreError::ManifestOpsError)
                 })?;
diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs
@@ -1,13 +1,3 @@
-use std::{
-    cmp::min,
-    collections::{HashMap, HashSet},
-    convert::Infallible,
-    future::{Future, ready},
-    ops::Range,
-    pin::Pin,
-    sync::Arc,
-};
-
 use async_stream::try_stream;
 use bytes::Bytes;
 use chrono::{DateTime, Utc};
@@ -16,6 +6,16 @@ use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::Either, stream
 use itertools::{Itertools as _, enumerate, repeat_n};
 use regex::bytes::Regex;
 use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use std::{
+    cmp::min,
+    collections::{BTreeMap, HashMap, HashSet},
+    convert::Infallible,
+    future::{Future, ready},
+    ops::Range,
+    pin::Pin,
+    sync::Arc,
+};
 use thiserror::Error;
 use tokio::task::JoinError;
 use tracing::{Instrument, debug, info, instrument, trace, warn};
@@ -95,6 +95,8 @@ pub enum SessionErrorKind {
     Conflict { expected_parent: Option<SnapshotId>, actual_parent: Option<SnapshotId> },
     #[error("cannot rebase snapshot {snapshot} on top of the branch")]
     RebaseFailed { snapshot: SnapshotId, conflicts: Vec<Conflict> },
+    #[error("error in serializing config to JSON")]
+    JsonSerializationError(#[from] serde_json::Error),
     #[error("error in session serialization")]
     SerializationError(#[from] rmp_serde::encode::Error),
     #[error("error in session deserialization")]
@@ -954,6 +956,8 @@ impl Session {
         properties: Option<SnapshotProperties>,
     ) -> SessionResult<SnapshotId> {
         let nodes = self.list_nodes().await?.collect::<Vec<_>>();
+        // We need to populate the `splits` before calling `commit`.
+        // In the normal chunk setting workflow, that would've been done by `set_chunk_ref`
         for node in nodes.into_iter().flatten() {
             if let NodeSnapshot {
                 id,
@@ -965,7 +969,13 @@ impl Session {
                 self.get_splits(&id, &path, &shape, &dimension_names);
             }
         }
-        self._commit(message, properties, true).await
+
+        let splitting_config_serialized =
+            serde_json::to_value(self.config.manifest().splitting())?;
+        let mut properties =
+            properties.unwrap_or_else(|| BTreeMap::<String, Value>::new());
+        properties.insert("splitting_config".to_string(), splitting_config_serialized);
+        self._commit(message, Some(properties), true).await
     }
 
     #[instrument(skip(self, properties))]