diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 97ce72aa1..4b52796c6 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -1465,6 +1465,10 @@ class PyRepository: max_compressed_manifest_mem_bytes: int = 512 * 1024 * 1024, max_concurrent_manifest_fetches: int = 500, ) -> int: ... + def inspect_snapshot(self, snapshot_id: str, *, pretty: bool = True) -> str: ... + async def inspect_snapshot_async( + self, snapshot_id: str, *, pretty: bool = True + ) -> str: ... class PySession: @classmethod diff --git a/icechunk-python/python/icechunk/repository.py b/icechunk-python/python/icechunk/repository.py index 1234058fc..58e9a9f62 100644 --- a/icechunk-python/python/icechunk/repository.py +++ b/icechunk-python/python/icechunk/repository.py @@ -1389,3 +1389,11 @@ async def total_chunks_storage_async( max_compressed_manifest_mem_bytes=max_compressed_manifest_mem_bytes, max_concurrent_manifest_fetches=max_concurrent_manifest_fetches, ) + + def inspect_snapshot(self, snapshot_id: str, *, pretty: bool = True) -> str: + return self._repository.inspect_snapshot(snapshot_id, pretty=pretty) + + async def inspect_snapshot_async( + self, snapshot_id: str, *, pretty: bool = True + ) -> str: + return await self._repository.inspect_snapshot_async(snapshot_id, pretty=pretty) diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs index 5c420f2ae..cd9841a5f 100644 --- a/icechunk-python/src/repository.rs +++ b/icechunk-python/src/repository.rs @@ -17,12 +17,13 @@ use icechunk::{ snapshot::{ManifestFileInfo, SnapshotInfo, SnapshotProperties}, transaction_log::Diff, }, + inspect::snapshot_json, ops::{ gc::{ExpiredRefAction, GCConfig, GCSummary, expire, garbage_collect}, manifests::rewrite_manifests, stats::repo_chunks_storage, }, - repository::{RepositoryErrorKind, VersionInfo}, + repository::{RepositoryError, RepositoryErrorKind, VersionInfo}, }; use pyo3::{ IntoPyObjectExt, @@ -1623,6 +1624,42 @@ impl PyRepository { Ok(result) }) } + + #[pyo3(signature = (snapshot_id, *, pretty = true))] + fn inspect_snapshot(&self, snapshot_id: String, pretty: bool) -> PyResult { + let result = pyo3_async_runtimes::tokio::get_runtime() + .block_on(async move { + let lock = self.0.read().await; + let snap = SnapshotId::try_from(snapshot_id.as_str()) + .map_err(|e| RepositoryErrorKind::Other(e.to_string()))?; + let res = snapshot_json(lock.asset_manager(), &snap, pretty).await?; + Ok(res) + }) + .map_err(PyIcechunkStoreError::RepositoryError)?; + Ok(result) + } + + #[pyo3(signature = (snapshot_id, *, pretty = true))] + fn inspect_snapshot_async<'py>( + &self, + py: Python<'py>, + snapshot_id: String, + pretty: bool, + ) -> PyResult> { + let repository = self.0.clone(); + pyo3_async_runtimes::tokio::future_into_py(py, async move { + let lock = repository.read().await; + let snap = SnapshotId::try_from(snapshot_id.as_str()) + .map_err(|e| { + RepositoryError::from(RepositoryErrorKind::Other(e.to_string())) + }) + .map_err(PyIcechunkStoreError::RepositoryError)?; + let res = snapshot_json(lock.asset_manager(), &snap, pretty) + .await + .map_err(PyIcechunkStoreError::RepositoryError)?; + Ok(res) + }) + } } fn map_credentials( diff --git a/icechunk-python/tests/test_inspect.py b/icechunk-python/tests/test_inspect.py new file mode 100644 index 000000000..583ca11f3 --- /dev/null +++ b/icechunk-python/tests/test_inspect.py @@ -0,0 +1,35 @@ +import json + +import icechunk as ic + + +def test_inspect_snapshot() -> None: + repo = ic.Repository.open( + storage=ic.local_filesystem_storage("./tests/data/split-repo") + ) + snap = next(repo.ancestry(branch="main")).id + pretty_str = repo.inspect_snapshot(snap, pretty=True) + non_pretty_str = repo.inspect_snapshot(snap, pretty=False) + + pretty = json.loads(pretty_str) + non_pretty = json.loads(non_pretty_str) + + assert pretty["id"] == snap + assert pretty_str != non_pretty_str + assert pretty == non_pretty + + +async def test_inspect_snapshot_async() -> None: + repo = await ic.Repository.open_async( + storage=ic.local_filesystem_storage("./tests/data/split-repo") + ) + snap = next(repo.ancestry(branch="main")).id + pretty_str = await repo.inspect_snapshot_async(snap, pretty=True) + non_pretty_str = await repo.inspect_snapshot_async(snap, pretty=False) + + pretty = json.loads(pretty_str) + non_pretty = json.loads(non_pretty_str) + + assert pretty["id"] == snap + assert pretty_str != non_pretty_str + assert pretty == non_pretty diff --git a/icechunk/src/inspect.rs b/icechunk/src/inspect.rs new file mode 100644 index 000000000..1a0c4fb3f --- /dev/null +++ b/icechunk/src/inspect.rs @@ -0,0 +1,159 @@ +use chrono::{DateTime, Utc}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; + +use crate::{ + asset_manager::AssetManager, + format::{ + SnapshotId, + manifest::ManifestRef, + snapshot::{ + ManifestFileInfo, NodeData, NodeSnapshot, NodeType, SnapshotProperties, + }, + }, + repository::{RepositoryErrorKind, RepositoryResult}, +}; + +#[derive(Debug, Serialize, Deserialize)] +struct ManifestFileInfoInspect { + id: String, + size_bytes: u64, + num_chunk_refs: u32, +} + +impl From for ManifestFileInfoInspect { + fn from(value: ManifestFileInfo) -> Self { + Self { + id: value.id.to_string(), + size_bytes: value.size_bytes, + num_chunk_refs: value.num_chunk_refs, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct ManifestRefInspect { + id: String, + extents: Vec<(u32, u32)>, +} + +impl From for ManifestRefInspect { + fn from(value: ManifestRef) -> Self { + Self { + id: value.object_id.to_string(), + extents: value.extents.iter().map(|r| (r.start, r.end)).collect(), + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct NodeSnapshotInspect { + id: String, + path: String, + node_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + manifest_refs: Option>, +} + +impl From for NodeSnapshotInspect { + fn from(value: NodeSnapshot) -> Self { + Self { + id: value.id.to_string(), + path: value.path.to_string(), + node_type: match value.node_type() { + NodeType::Group => "group".to_string(), + NodeType::Array => "array".to_string(), + }, + manifest_refs: match value.node_data { + NodeData::Array { manifests, .. } => { + let ms = manifests.into_iter().map(|m| m.into()).collect(); + Some(ms) + } + NodeData::Group => None, + }, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +struct SnapshotInfoInspect { + // TODO: add fields + //path: String, + //size_bytes: u64, + id: String, + #[serde(skip_serializing_if = "Option::is_none")] + parent_id: Option, + flushed_at: DateTime, + commit_message: String, + metadata: SnapshotProperties, + + manifests: Vec, + nodes: Vec, +} + +async fn inspect_snapshot( + asset_manager: &AssetManager, + id: &SnapshotId, +) -> RepositoryResult { + let snap = asset_manager.fetch_snapshot(id).await?; + let res = SnapshotInfoInspect { + id: snap.id().to_string(), + parent_id: snap.parent_id().map(|p| p.to_string()), + flushed_at: snap.flushed_at()?, + commit_message: snap.message(), + metadata: snap.metadata()?, + manifests: snap.manifest_files().map(|f| f.into()).collect(), + nodes: snap.iter().map_ok(|n| n.into()).try_collect()?, + }; + + Ok(res) +} + +pub async fn snapshot_json( + asset_manager: &AssetManager, + id: &SnapshotId, + pretty: bool, +) -> RepositoryResult { + let info = inspect_snapshot(asset_manager, id).await?; + let res = if pretty { + serde_json::to_string_pretty(&info) + } else { + serde_json::to_string(&info) + } + .map_err(|e| RepositoryErrorKind::Other(e.to_string()))?; + Ok(res) +} + +#[cfg(test)] +#[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)] +mod tests { + use super::*; + use crate::{ObjectStorage, Repository, repository::VersionInfo}; + use futures::{StreamExt, TryStreamExt}; + use std::{path::PathBuf, sync::Arc}; + + #[icechunk_macros::tokio_test] + async fn test_print_snapshot() -> Result<(), Box> { + let st = Arc::new( + ObjectStorage::new_local_filesystem(&PathBuf::from( + "../icechunk-python/tests/data/split-repo", + )) + .await?, + ); + let repo = Repository::open(None, st, Default::default()).await?; + let snap_id = repo + .ancestry(&VersionInfo::BranchTipRef("main".to_string())) + .await? + .boxed() + .try_next() + .await? + .unwrap() + .id; + + let json = snapshot_json(repo.asset_manager(), &snap_id, true).await?; + let info: SnapshotInfoInspect = serde_json::from_str(json.as_str())?; + assert!(info.id == snap_id.to_string()); + + Ok(()) + } +} diff --git a/icechunk/src/lib.rs b/icechunk/src/lib.rs index db82d1210..31aef15f6 100644 --- a/icechunk/src/lib.rs +++ b/icechunk/src/lib.rs @@ -24,6 +24,7 @@ pub mod config; pub mod conflicts; pub mod error; pub mod format; +pub mod inspect; pub mod ops; pub mod refs; pub mod repository;