From 2e1e5fb560a835b877a9999eab8f45f23110c44c Mon Sep 17 00:00:00 2001 From: Sebastian Galkin Date: Thu, 31 Jul 2025 03:47:24 -0300 Subject: [PATCH 1/3] Add `Repository.inspect_snapshot` to inspect snapshot data in JSON Example JSON: ```json { "id": "BYJMXJ1CHYE86YSHESR0", "parent_id": "X8NA4KMB3XYEDM2FAJEG", "flushed_at": "2025-04-15T18:40:41.944920Z", "commit_message": "write data again with more splits", "metadata": {}, "manifests": [ { "id": "19KPV7V2275XRM01YAWG", "size_bytes": 154, "num_chunk_refs": 1 }, { "id": "2PQ1EA39NB2VK7QK688G", "size_bytes": 151, "num_chunk_refs": 1 }, { "id": "7V2NE5Q4H7SN9QF0ZHJ0", "size_bytes": 153, "num_chunk_refs": 1 }, { "id": "7WT30WPG1HF0P2QXGV6G", "size_bytes": 152, "num_chunk_refs": 1 }, { "id": "88Z1G93EH4VF9X28BBHG", "size_bytes": 154, "num_chunk_refs": 1 }, { "id": "B81H9ZEGPV7E15CWXA10", "size_bytes": 153, "num_chunk_refs": 1 }, { "id": "D6QG80A0HJW564ZXRBRG", "size_bytes": 155, "num_chunk_refs": 1 }, { "id": "DDPY0EEPKZXW1ACW41HG", "size_bytes": 152, "num_chunk_refs": 1 }, { "id": "FM7YJ4BBE12X6CE0T4AG", "size_bytes": 152, "num_chunk_refs": 1 }, { "id": "HB9W7VCB4QA786S2XJBG", "size_bytes": 153, "num_chunk_refs": 1 }, { "id": "J3ZNDGTGA4AY29S8SJVG", "size_bytes": 153, "num_chunk_refs": 1 }, { "id": "MZG15YADGEG849HP3DPG", "size_bytes": 152, "num_chunk_refs": 1 }, { "id": "RNXAWAZHFV0CDV8JJVBG", "size_bytes": 155, "num_chunk_refs": 1 }, { "id": "TTSZ8VYDX0Y7PB6A13H0", "size_bytes": 155, "num_chunk_refs": 1 }, { "id": "WS2BZ61Q1V564ZG6GSJG", "size_bytes": 173, "num_chunk_refs": 5 }, { "id": "XXCGBC5VMRPB89PFMFGG", "size_bytes": 153, "num_chunk_refs": 1 }, { "id": "ZEQVP57D4FWAZJ0BJV6G", "size_bytes": 152, "num_chunk_refs": 1 } ], "nodes": [ { "id": "K5M4YV41NR8BJ", "path": "/", "node_type": "group" }, { "id": "B6743MVT85QN0", "path": "/group1", "node_type": "group" }, { "id": "6HM4K1ZW0HFG4", "path": "/group1/small_chunks", "node_type": "array", "manifest_refs": [ { "id": "WS2BZ61Q1V564ZG6GSJG", "extents": [ [ 0, 5 ] ] } ] }, { "id": "TD9T62TWTHR14", "path": "/group1/split", "node_type": "array", "manifest_refs": [ { "id": "DDPY0EEPKZXW1ACW41HG", "extents": [ [ 3, 4 ], [ 3, 4 ] ] }, { "id": "J3ZNDGTGA4AY29S8SJVG", "extents": [ [ 0, 1 ], [ 2, 3 ] ] }, { "id": "HB9W7VCB4QA786S2XJBG", "extents": [ [ 1, 2 ], [ 1, 2 ] ] }, { "id": "FM7YJ4BBE12X6CE0T4AG", "extents": [ [ 1, 2 ], [ 2, 3 ] ] }, { "id": "7WT30WPG1HF0P2QXGV6G", "extents": [ [ 1, 2 ], [ 3, 4 ] ] }, { "id": "TTSZ8VYDX0Y7PB6A13H0", "extents": [ [ 0, 1 ], [ 3, 4 ] ] }, { "id": "D6QG80A0HJW564ZXRBRG", "extents": [ [ 0, 1 ], [ 1, 2 ] ] }, { "id": "XXCGBC5VMRPB89PFMFGG", "extents": [ [ 1, 2 ], [ 0, 1 ] ] }, { "id": "RNXAWAZHFV0CDV8JJVBG", "extents": [ [ 2, 3 ], [ 1, 2 ] ] }, { "id": "7V2NE5Q4H7SN9QF0ZHJ0", "extents": [ [ 0, 1 ], [ 0, 1 ] ] }, { "id": "19KPV7V2275XRM01YAWG", "extents": [ [ 3, 4 ], [ 2, 3 ] ] }, { "id": "2PQ1EA39NB2VK7QK688G", "extents": [ [ 2, 3 ], [ 2, 3 ] ] }, { "id": "B81H9ZEGPV7E15CWXA10", "extents": [ [ 2, 3 ], [ 3, 4 ] ] }, { "id": "ZEQVP57D4FWAZJ0BJV6G", "extents": [ [ 3, 4 ], [ 1, 2 ] ] }, { "id": "MZG15YADGEG849HP3DPG", "extents": [ [ 3, 4 ], [ 0, 1 ] ] }, { "id": "88Z1G93EH4VF9X28BBHG", "extents": [ [ 2, 3 ], [ 0, 1 ] ] } ] } ] } ``` --- .../python/icechunk/_icechunk_python.pyi | 1 + icechunk-python/python/icechunk/repository.py | 3 + icechunk-python/src/repository.rs | 14 ++ icechunk-python/tests/test_inspect.py | 19 +++ icechunk/src/inspect.rs | 157 ++++++++++++++++++ icechunk/src/lib.rs | 1 + 6 files changed, 195 insertions(+) create mode 100644 icechunk-python/tests/test_inspect.py create mode 100644 icechunk/src/inspect.rs diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 97ce72aa1..fc4e2a792 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -1465,6 +1465,7 @@ class PyRepository: max_compressed_manifest_mem_bytes: int = 512 * 1024 * 1024, max_concurrent_manifest_fetches: int = 500, ) -> int: ... + def inspect_snapshot(self, snapshot_id: str, *, pretty: bool = True) -> str: ... class PySession: @classmethod diff --git a/icechunk-python/python/icechunk/repository.py b/icechunk-python/python/icechunk/repository.py index 1234058fc..e1f0a9897 100644 --- a/icechunk-python/python/icechunk/repository.py +++ b/icechunk-python/python/icechunk/repository.py @@ -1389,3 +1389,6 @@ async def total_chunks_storage_async( max_compressed_manifest_mem_bytes=max_compressed_manifest_mem_bytes, max_concurrent_manifest_fetches=max_concurrent_manifest_fetches, ) + + def inspect_snapshot(self, snapshot_id: str, *, pretty: bool = True) -> str: + return self._repository.inspect_snapshot(snapshot_id, pretty=pretty) diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs index 5c420f2ae..043059791 100644 --- a/icechunk-python/src/repository.rs +++ b/icechunk-python/src/repository.rs @@ -17,6 +17,7 @@ use icechunk::{ snapshot::{ManifestFileInfo, SnapshotInfo, SnapshotProperties}, transaction_log::Diff, }, + inspect::snapshot_json, ops::{ gc::{ExpiredRefAction, GCConfig, GCSummary, expire, garbage_collect}, manifests::rewrite_manifests, @@ -1623,6 +1624,19 @@ impl PyRepository { Ok(result) }) } + + #[pyo3(signature = (snapshot_id, *, pretty = true))] + fn inspect_snapshot(&self, snapshot_id: String, pretty: bool) -> PyResult { + let result = pyo3_async_runtimes::tokio::get_runtime().block_on(async move { + let lock = self.0.read().await; + let snap = snapshot_id.try_into().map_err(PyValueError::new_err)?; + let res = snapshot_json(lock.asset_manager(), &snap, pretty) + .await + .map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok::<_, PyErr>(res) + })?; + Ok(result) + } } fn map_credentials( diff --git a/icechunk-python/tests/test_inspect.py b/icechunk-python/tests/test_inspect.py new file mode 100644 index 000000000..8440b2e34 --- /dev/null +++ b/icechunk-python/tests/test_inspect.py @@ -0,0 +1,19 @@ +import json + +import icechunk as ic + + +def test_inspect_snapshot() -> None: + repo = ic.Repository.open( + storage=ic.local_filesystem_storage("./tests/data/split-repo") + ) + snap = next(repo.ancestry(branch="main")).id + pretty_str = repo.inspect_snapshot(snap, pretty=True) + non_pretty_str = repo.inspect_snapshot(snap, pretty=False) + + pretty = json.loads(pretty_str) + non_pretty = json.loads(non_pretty_str) + + assert pretty["id"] == snap + assert pretty_str != non_pretty_str + assert pretty == non_pretty diff --git a/icechunk/src/inspect.rs b/icechunk/src/inspect.rs new file mode 100644 index 000000000..f89c48fd2 --- /dev/null +++ b/icechunk/src/inspect.rs @@ -0,0 +1,157 @@ +use chrono::{DateTime, Utc}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +use crate::{ + asset_manager::AssetManager, + format::{ + SnapshotId, + manifest::ManifestRef, + snapshot::{ + ManifestFileInfo, NodeData, NodeSnapshot, NodeType, SnapshotProperties, + }, + }, +}; + +#[derive(Debug, Serialize, Deserialize)] +struct ManifestFileInfoInspect { + id: String, + size_bytes: u64, + num_chunk_refs: u32, +} + +impl From for ManifestFileInfoInspect { + fn from(value: ManifestFileInfo) -> Self { + Self { + id: value.id.to_string(), + size_bytes: value.size_bytes, + num_chunk_refs: value.num_chunk_refs, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct ManifestRefInspect { + id: String, + extents: Vec<(u32, u32)>, +} + +impl From for ManifestRefInspect { + fn from(value: ManifestRef) -> Self { + Self { + id: value.object_id.to_string(), + extents: value.extents.iter().map(|r| (r.start, r.end)).collect(), + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct NodeSnapshotInspect { + id: String, + path: String, + node_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + manifest_refs: Option>, +} + +impl From for NodeSnapshotInspect { + fn from(value: NodeSnapshot) -> Self { + Self { + id: value.id.to_string(), + path: value.path.to_string(), + node_type: match value.node_type() { + NodeType::Group => "group".to_string(), + NodeType::Array => "array".to_string(), + }, + manifest_refs: match value.node_data { + NodeData::Array { manifests, .. } => { + let ms = manifests.into_iter().map(|m| m.into()).collect(); + Some(ms) + } + NodeData::Group => None, + }, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct SnapshotInfoInspect { + // TODO: add fields + //path: String, + //size_bytes: u64, + id: String, + #[serde(skip_serializing_if = "Option::is_none")] + parent_id: Option, + flushed_at: DateTime, + commit_message: String, + metadata: SnapshotProperties, + + manifests: Vec, + nodes: Vec, +} + +async fn inspect_snapshot( + asset_manager: &AssetManager, + id: &SnapshotId, +) -> Result> { + let snap = asset_manager.fetch_snapshot(id).await?; + let res = SnapshotInfoInspect { + id: snap.id().to_string(), + parent_id: snap.parent_id().map(|p| p.to_string()), + flushed_at: snap.flushed_at()?, + commit_message: snap.message(), + metadata: snap.metadata()?, + manifests: snap.manifest_files().map(|f| f.into()).collect(), + nodes: snap.iter().map_ok(|n| n.into()).try_collect()?, + }; + + Ok(res) +} + +pub async fn snapshot_json( + asset_manager: &AssetManager, + id: &SnapshotId, + pretty: bool, +) -> Result> { + let info = inspect_snapshot(asset_manager, id).await?; + let res = if pretty { + serde_json::to_string_pretty(&info) + } else { + serde_json::to_string(&info) + }?; + Ok(res) +} + +#[cfg(test)] +#[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)] +mod tests { + use super::*; + use crate::{ObjectStorage, Repository, repository::VersionInfo}; + use futures::{StreamExt, TryStreamExt}; + use std::{path::PathBuf, sync::Arc}; + + #[icechunk_macros::tokio_test] + async fn test_print_snapshot() -> Result<(), Box> { + let st = Arc::new( + ObjectStorage::new_local_filesystem(&PathBuf::from( + "../icechunk-python/tests/data/split-repo", + )) + .await?, + ); + let repo = Repository::open(None, st, Default::default()).await?; + let snap_id = repo + .ancestry(&VersionInfo::BranchTipRef("main".to_string())) + .await? + .boxed() + .try_next() + .await? + .unwrap() + .id; + + let json = snapshot_json(repo.asset_manager(), &snap_id, true).await?; + let info: SnapshotInfoInspect = serde_json::from_str(json.as_str())?; + assert!(info.id == snap_id.to_string()); + + Ok(()) + } +} diff --git a/icechunk/src/lib.rs b/icechunk/src/lib.rs index db82d1210..31aef15f6 100644 --- a/icechunk/src/lib.rs +++ b/icechunk/src/lib.rs @@ -24,6 +24,7 @@ pub mod config; pub mod conflicts; pub mod error; pub mod format; +pub mod inspect; pub mod ops; pub mod refs; pub mod repository; From 487c22c19307385648b1271bd05426305e871f68 Mon Sep 17 00:00:00 2001 From: Sebastian Galkin Date: Thu, 31 Jul 2025 11:17:46 -0300 Subject: [PATCH 2/3] Improve errors --- icechunk-python/src/repository.rs | 17 +++++++++-------- icechunk/src/inspect.rs | 8 +++++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs index 043059791..896a3b8a4 100644 --- a/icechunk-python/src/repository.rs +++ b/icechunk-python/src/repository.rs @@ -1627,14 +1627,15 @@ impl PyRepository { #[pyo3(signature = (snapshot_id, *, pretty = true))] fn inspect_snapshot(&self, snapshot_id: String, pretty: bool) -> PyResult { - let result = pyo3_async_runtimes::tokio::get_runtime().block_on(async move { - let lock = self.0.read().await; - let snap = snapshot_id.try_into().map_err(PyValueError::new_err)?; - let res = snapshot_json(lock.asset_manager(), &snap, pretty) - .await - .map_err(|e| PyValueError::new_err(e.to_string()))?; - Ok::<_, PyErr>(res) - })?; + let result = pyo3_async_runtimes::tokio::get_runtime() + .block_on(async move { + let lock = self.0.read().await; + let snap = SnapshotId::try_from(snapshot_id.as_str()) + .map_err(|e| RepositoryErrorKind::Other(e.to_string()))?; + let res = snapshot_json(lock.asset_manager(), &snap, pretty).await?; + Ok(res) + }) + .map_err(PyIcechunkStoreError::RepositoryError)?; Ok(result) } } diff --git a/icechunk/src/inspect.rs b/icechunk/src/inspect.rs index f89c48fd2..1a0c4fb3f 100644 --- a/icechunk/src/inspect.rs +++ b/icechunk/src/inspect.rs @@ -11,6 +11,7 @@ use crate::{ ManifestFileInfo, NodeData, NodeSnapshot, NodeType, SnapshotProperties, }, }, + repository::{RepositoryErrorKind, RepositoryResult}, }; #[derive(Debug, Serialize, Deserialize)] @@ -93,7 +94,7 @@ struct SnapshotInfoInspect { async fn inspect_snapshot( asset_manager: &AssetManager, id: &SnapshotId, -) -> Result> { +) -> RepositoryResult { let snap = asset_manager.fetch_snapshot(id).await?; let res = SnapshotInfoInspect { id: snap.id().to_string(), @@ -112,13 +113,14 @@ pub async fn snapshot_json( asset_manager: &AssetManager, id: &SnapshotId, pretty: bool, -) -> Result> { +) -> RepositoryResult { let info = inspect_snapshot(asset_manager, id).await?; let res = if pretty { serde_json::to_string_pretty(&info) } else { serde_json::to_string(&info) - }?; + } + .map_err(|e| RepositoryErrorKind::Other(e.to_string()))?; Ok(res) } From abee47ca5c633a4e24fe46ae2c9fc0ce9ad867fd Mon Sep 17 00:00:00 2001 From: Sebastian Galkin Date: Thu, 31 Jul 2025 11:28:29 -0300 Subject: [PATCH 3/3] Add async version --- .../python/icechunk/_icechunk_python.pyi | 3 +++ icechunk-python/python/icechunk/repository.py | 5 ++++ icechunk-python/src/repository.rs | 24 ++++++++++++++++++- icechunk-python/tests/test_inspect.py | 16 +++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index fc4e2a792..4b52796c6 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -1466,6 +1466,9 @@ class PyRepository: max_concurrent_manifest_fetches: int = 500, ) -> int: ... def inspect_snapshot(self, snapshot_id: str, *, pretty: bool = True) -> str: ... + async def inspect_snapshot_async( + self, snapshot_id: str, *, pretty: bool = True + ) -> str: ... class PySession: @classmethod diff --git a/icechunk-python/python/icechunk/repository.py b/icechunk-python/python/icechunk/repository.py index e1f0a9897..58e9a9f62 100644 --- a/icechunk-python/python/icechunk/repository.py +++ b/icechunk-python/python/icechunk/repository.py @@ -1392,3 +1392,8 @@ async def total_chunks_storage_async( def inspect_snapshot(self, snapshot_id: str, *, pretty: bool = True) -> str: return self._repository.inspect_snapshot(snapshot_id, pretty=pretty) + + async def inspect_snapshot_async( + self, snapshot_id: str, *, pretty: bool = True + ) -> str: + return await self._repository.inspect_snapshot_async(snapshot_id, pretty=pretty) diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs index 896a3b8a4..cd9841a5f 100644 --- a/icechunk-python/src/repository.rs +++ b/icechunk-python/src/repository.rs @@ -23,7 +23,7 @@ use icechunk::{ manifests::rewrite_manifests, stats::repo_chunks_storage, }, - repository::{RepositoryErrorKind, VersionInfo}, + repository::{RepositoryError, RepositoryErrorKind, VersionInfo}, }; use pyo3::{ IntoPyObjectExt, @@ -1638,6 +1638,28 @@ impl PyRepository { .map_err(PyIcechunkStoreError::RepositoryError)?; Ok(result) } + + #[pyo3(signature = (snapshot_id, *, pretty = true))] + fn inspect_snapshot_async<'py>( + &self, + py: Python<'py>, + snapshot_id: String, + pretty: bool, + ) -> PyResult> { + let repository = self.0.clone(); + pyo3_async_runtimes::tokio::future_into_py(py, async move { + let lock = repository.read().await; + let snap = SnapshotId::try_from(snapshot_id.as_str()) + .map_err(|e| { + RepositoryError::from(RepositoryErrorKind::Other(e.to_string())) + }) + .map_err(PyIcechunkStoreError::RepositoryError)?; + let res = snapshot_json(lock.asset_manager(), &snap, pretty) + .await + .map_err(PyIcechunkStoreError::RepositoryError)?; + Ok(res) + }) + } } fn map_credentials( diff --git a/icechunk-python/tests/test_inspect.py b/icechunk-python/tests/test_inspect.py index 8440b2e34..583ca11f3 100644 --- a/icechunk-python/tests/test_inspect.py +++ b/icechunk-python/tests/test_inspect.py @@ -17,3 +17,19 @@ def test_inspect_snapshot() -> None: assert pretty["id"] == snap assert pretty_str != non_pretty_str assert pretty == non_pretty + + +async def test_inspect_snapshot_async() -> None: + repo = await ic.Repository.open_async( + storage=ic.local_filesystem_storage("./tests/data/split-repo") + ) + snap = next(repo.ancestry(branch="main")).id + pretty_str = await repo.inspect_snapshot_async(snap, pretty=True) + non_pretty_str = await repo.inspect_snapshot_async(snap, pretty=False) + + pretty = json.loads(pretty_str) + non_pretty = json.loads(non_pretty_str) + + assert pretty["id"] == snap + assert pretty_str != non_pretty_str + assert pretty == non_pretty