Skip to content

Commit 1c3b67d

Browse files
committed
Improve performance of size stats
This introduces two limits to the concurrency: * max memory kept for in-flight manifests * max number of concurrent manifest fetches With this, we don't need to limit the concurrency to a specific number of manifests; instead, we can fetch more manifests if they are small. Combining these two limits, we get an important performance optimization, particularly for repos with a large number of smallish manifests. I used this script to test and observe the performance of this: ```python import arraylake import time import icechunk icechunk.set_logs_filter("icechunk::stream_utils=trace") token = '...' client = arraylake.Client(token=token) repo = client.get_repo("earthmover-public/era5-surface-aws") start_time = time.time() repo.total_chunks_storage(None, None) print(" %s seconds" % (time.time() - start_time)) ``` At the time of running this, ERA5 took around 4 seconds to compute. Testing with real-world repos with small manifests shows performance improvements on the order of 30x.
1 parent d6d93d0 commit 1c3b67d

File tree

8 files changed

+221
-58
lines changed

8 files changed

+221
-58
lines changed

icechunk-python/python/icechunk/_icechunk_python.pyi

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1357,7 +1357,9 @@ class PyRepository:
13571357
self, message: str, *, branch: str, metadata: dict[str, Any] | None = None
13581358
) -> str: ...
13591359
def total_chunks_storage(
1360-
self, process_manifests_concurrently: int | None = None
1360+
self,
1361+
max_manifest_mem_bytes: int | None = None,
1362+
max_concurrent_manifest_fetches: int | None = None,
13611363
) -> int: ...
13621364

13631365
class PySession:

icechunk-python/python/icechunk/repository.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,9 @@ def rewrite_manifests(
729729
)
730730

731731
def total_chunks_storage(
732-
self, process_manifests_concurrently: int | None = None
732+
self,
733+
max_manifest_mem_bytes: int | None = None,
734+
max_concurrent_manifest_fetches: int | None = None,
733735
) -> int:
734736
"""Calculate the total storage used for chunks, in bytes.
735737
@@ -743,8 +745,12 @@ def total_chunks_storage(
743745
744746
Parameters
745747
----------
746-
process_manifests_concurrently : int | None
747-
Process this many manifests concurrently. Defaults to 10.
748+
max_manifest_mem_bytes : int | None
749+
Don't use more than this memory to store in-flight manifests. Defaults to 512 MB.
750+
max_concurrent_manifest_fetches : int | None
751+
Don't run more than this many concurrent manifest fetches. Defaults to 500.
748752
"""
749753

750-
return self._repository.total_chunks_storage(process_manifests_concurrently)
754+
return self._repository.total_chunks_storage(
755+
max_manifest_mem_bytes, max_concurrent_manifest_fetches
756+
)

icechunk-python/src/repository.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use std::{
22
borrow::Cow,
33
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
4-
num::NonZeroU16,
4+
num::{NonZeroU16, NonZeroUsize},
55
sync::Arc,
66
};
77

@@ -1053,7 +1053,8 @@ impl PyRepository {
10531053
pub fn total_chunks_storage(
10541054
&self,
10551055
py: Python<'_>,
1056-
process_manifests_concurrently: Option<NonZeroU16>,
1056+
max_manifest_mem_bytes: Option<NonZeroUsize>,
1057+
max_concurrent_manifest_fetches: Option<NonZeroU16>,
10571058
) -> PyResult<u64> {
10581059
// This function calls block_on, so we need to allow other thread python to make progress
10591060
py.allow_threads(move || {
@@ -1071,8 +1072,12 @@ impl PyRepository {
10711072
storage.as_ref(),
10721073
&storage_settings,
10731074
asset_manager,
1074-
process_manifests_concurrently.unwrap_or(
1075-
NonZeroU16::try_from(10).unwrap_or(NonZeroU16::MIN),
1075+
max_manifest_mem_bytes.unwrap_or(
1076+
NonZeroUsize::try_from(512 * 1024 * 1024)
1077+
.unwrap_or(NonZeroUsize::MIN),
1078+
),
1079+
max_concurrent_manifest_fetches.unwrap_or(
1080+
NonZeroU16::try_from(500).unwrap_or(NonZeroU16::MIN),
10761081
),
10771082
)
10781083
.await

icechunk/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ pub mod storage;
3232
pub mod store;
3333
#[cfg(test)]
3434
pub mod strategies;
35+
mod stream_utils;
3536
pub mod virtual_chunks;
3637

3738
pub use config::{ObjectStoreConfig, RepositoryConfig};

icechunk/src/ops/gc.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use crate::{
1212
ChunkId, IcechunkFormatError, IcechunkFormatErrorKind, ManifestId, SnapshotId,
1313
manifest::ChunkPayload,
1414
},
15-
ops::pointed_snapshot_ids,
15+
ops::pointed_snapshots,
1616
refs::{Ref, RefError, delete_branch, delete_tag, list_refs},
1717
repository::{RepositoryError, RepositoryErrorKind},
1818
storage::{self, DeleteObjectsResult, ListInfo},
@@ -174,23 +174,22 @@ pub async fn garbage_collect(
174174
}
175175

176176
tracing::info!("Finding GC roots");
177-
let all_snaps = pointed_snapshot_ids(
177+
let all_snaps = pointed_snapshots(
178178
storage,
179179
storage_settings,
180180
Arc::clone(&asset_manager),
181181
&config.extra_roots,
182182
)
183183
.await?;
184184

185-
// FIXME: add attribute files
186185
let mut keep_chunks = HashSet::new();
187186
let mut keep_manifests = HashSet::new();
188187
let mut keep_snapshots = HashSet::new();
189188

190189
tracing::info!("Calculating retained objects");
191190
pin!(all_snaps);
192-
while let Some(snap_id) = all_snaps.try_next().await? {
193-
let snap = asset_manager.fetch_snapshot(&snap_id).await?;
191+
while let Some(snap) = all_snaps.try_next().await? {
192+
let snap_id = snap.id();
194193
if config.deletes_snapshots() && keep_snapshots.insert(snap_id.clone()) {
195194
tracing::trace!("Adding snapshot to keep list: {}", &snap_id);
196195
}
@@ -240,6 +239,8 @@ pub async fn garbage_collect(
240239

241240
let mut summary = GCSummary::default();
242241

242+
tracing::info!("Starting deletes");
243+
243244
if config.deletes_snapshots() {
244245
let res = gc_snapshots(
245246
asset_manager.as_ref(),

icechunk/src/ops/stats.rs

Lines changed: 61 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,30 @@
1-
use futures::{TryStream, TryStreamExt as _, future::ready, stream};
1+
use futures::{TryStreamExt, future::ready, stream};
22
use std::{
33
collections::HashSet,
4-
num::NonZeroU16,
4+
num::{NonZeroU16, NonZeroUsize},
55
sync::{Arc, Mutex},
66
};
7+
use tracing::trace;
78

89
use crate::{
910
Storage,
1011
asset_manager::AssetManager,
11-
format::{ChunkId, ManifestId, manifest::ChunkPayload},
12+
format::{
13+
ChunkId,
14+
manifest::{ChunkPayload, Manifest},
15+
snapshot::ManifestFileInfo,
16+
},
1217
ops::pointed_snapshots,
13-
repository::{RepositoryErrorKind, RepositoryResult},
18+
repository::{RepositoryError, RepositoryErrorKind, RepositoryResult},
1419
storage,
20+
stream_utils::{StreamLimiter, try_unique_stream},
1521
};
1622

17-
async fn manifest_chunks_storage(
18-
manifest_id: ManifestId,
19-
manifest_size: u64,
20-
asset_manager: Arc<AssetManager>,
23+
fn calculate_manifest_storage(
24+
manifest: Arc<Manifest>,
2125
seen_chunks: Arc<Mutex<HashSet<ChunkId>>>,
2226
) -> RepositoryResult<u64> {
23-
let manifest = asset_manager.fetch_manifest(&manifest_id, manifest_size).await?;
27+
trace!(manifest_id = %manifest.id(), "Processing manifest");
2428
let mut size = 0;
2529
for payload in manifest.chunk_payloads() {
2630
match payload {
@@ -47,36 +51,18 @@ async fn manifest_chunks_storage(
4751
}
4852
}
4953
}
54+
trace!(manifest_id = %manifest.id(), "Manifest done");
5055
Ok(size)
5156
}
5257

53-
pub fn try_unique_stream<S, T, E, F, V>(
54-
f: F,
55-
stream: S,
56-
) -> impl TryStream<Ok = T, Error = E>
57-
where
58-
F: Fn(&S::Ok) -> V,
59-
S: TryStream<Ok = T, Error = E>,
60-
V: Eq + std::hash::Hash,
61-
{
62-
let mut seen = HashSet::new();
63-
stream.try_filter(move |item| {
64-
let v = f(item);
65-
if seen.insert(v) {
66-
futures::future::ready(true)
67-
} else {
68-
futures::future::ready(false)
69-
}
70-
})
71-
}
72-
7358
/// Compute the total size in bytes of all committed repo chunks.
7459
/// It doesn't include inline or virtual chunks.
7560
pub async fn repo_chunks_storage(
7661
storage: &(dyn Storage + Send + Sync),
7762
storage_settings: &storage::Settings,
7863
asset_manager: Arc<AssetManager>,
79-
process_manifests_concurrently: NonZeroU16,
64+
max_manifest_mem_bytes: NonZeroUsize,
65+
max_concurrent_manifest_fetches: NonZeroU16,
8066
) -> RepositoryResult<u64> {
8167
let extra_roots = Default::default();
8268
let all_snaps = pointed_snapshots(
@@ -90,27 +76,59 @@ pub async fn repo_chunks_storage(
9076
let all_manifest_infos = all_snaps
9177
// this could be slightly optimized by not collecting all manifest info records into a vec
9278
// but we don't expect too many, and they are small anyway
93-
.map_ok(|snap| stream::iter(snap.manifest_files().map(Ok).collect::<Vec<_>>()))
79+
.map_ok(|snap| {
80+
stream::iter(
81+
snap.manifest_files().map(Ok::<_, RepositoryError>).collect::<Vec<_>>(),
82+
)
83+
})
9484
.try_flatten();
85+
86+
// we don't want to check manifests more than once, so we unique them by their id
9587
let unique_manifest_infos = try_unique_stream(|mi| mi.id.clone(), all_manifest_infos);
9688

89+
// we want to fetch many manifests in parallel, but not more than memory allows
90+
// for this we use the StreamLimiter using the manifest size in bytes for usage
91+
let limiter = &Arc::new(StreamLimiter::new(
92+
max_manifest_mem_bytes.get(),
93+
|m: &ManifestFileInfo| m.size_bytes as usize,
94+
));
95+
96+
// The StreamLimiter works by calling limit on every element before they are processed
97+
let rate_limited_manifests = unique_manifest_infos
98+
.and_then(|m| async move { Ok(limiter.clone().limit(m).await) });
99+
97100
let seen_chunks = &Arc::new(Mutex::new(HashSet::new()));
98101
let asset_manager = &asset_manager;
99102

100-
let res = unique_manifest_infos
101-
.map_ok(|manifest_info| async move {
102-
let manifest_size = manifest_info.size_bytes;
103-
manifest_chunks_storage(
104-
manifest_info.id,
105-
manifest_size,
106-
Arc::clone(asset_manager),
107-
Arc::clone(seen_chunks),
108-
)
109-
.await
103+
let (_, res) = rate_limited_manifests
104+
.map_ok(|m| async move {
105+
let manifest =
106+
Arc::clone(asset_manager).fetch_manifest(&m.id, m.size_bytes).await?;
107+
Ok((manifest, m))
108+
})
109+
// Now we can buffer a bunch of fetch_manifest operations. Because we are using
110+
// StreamLimiter we know memory is not going to blow up
111+
.try_buffer_unordered(max_concurrent_manifest_fetches.get() as usize)
112+
.map_ok(|(manifest, minfo)| async move {
113+
let size = calculate_manifest_storage(manifest, Arc::clone(seen_chunks))?;
114+
Ok((size, minfo))
115+
})
116+
// We do some more buffering to get some concurrency on the processing of the manifest file
117+
// TODO: this should actually happen in a CPU bounded worker pool
118+
.try_buffer_unordered(4)
119+
// Now StreamLimiter requires us to call free, this will make room for more manifests to be
120+
// fetched into the previous buffer
121+
.and_then(|(size, minfo)| async move {
122+
limiter.clone().free(minfo).await;
123+
Ok(size)
124+
})
125+
.try_fold((0u64, 0), |(processed, total_size), partial| {
126+
//info!("Processed {processed} manifests");
127+
ready(Ok((processed + 1, total_size + partial)))
110128
})
111-
.try_buffered(process_manifests_concurrently.get() as usize)
112-
.try_fold(0, |total, partial| ready(Ok(total + partial)))
113129
.await?;
114130

131+
debug_assert_eq!(limiter.current_usage().await, (0, 0));
132+
115133
Ok(res)
116134
}

0 commit comments

Comments
 (0)