Skip to content

Commit e7c7766

Browse files
committed
feat: add commit infos apis to new snapshots
Signed-off-by: Robert Pack <[email protected]>
1 parent 27accd7 commit e7c7766

File tree

5 files changed

+173
-14
lines changed

5 files changed

+173
-14
lines changed

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ debug = "line-tables-only"
3131
# "default-engine",
3232
# "developer-visibility",
3333
# ] }
34-
delta_kernel = { git = "https://github.com/roeap/delta-kernel-rs", rev = "caeb70ab78e4d5f3b56b5105fd3587c1046d1e1b", features = [
34+
delta_kernel = { git = "https://github.com/roeap/delta-kernel-rs", rev = "023abf1ee604b77bbaa5efec97e043fc4bdf220b", features = [
3535
"default-engine",
3636
"developer-visibility",
3737
] }

crates/core/src/kernel/snapshot_next/eager.rs

+16-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ use std::sync::Arc;
22

33
use arrow::compute::{concat_batches, filter_record_batch};
44
use arrow_array::{BooleanArray, RecordBatch};
5-
use chrono::format::Item;
65
use delta_kernel::actions::set_transaction::SetTransactionMap;
76
use delta_kernel::actions::{get_log_add_schema, get_log_schema, ADD_NAME, REMOVE_NAME};
87
use delta_kernel::actions::{Add, Metadata, Protocol, SetTransaction};
@@ -11,14 +10,15 @@ use delta_kernel::log_segment::LogSegment;
1110
use delta_kernel::scan::log_replay::scan_action_iter;
1211
use delta_kernel::schema::Schema;
1312
use delta_kernel::table_properties::TableProperties;
14-
use delta_kernel::{EngineData, Expression, Table, Version};
13+
use delta_kernel::{Engine, EngineData, Expression, Table, Version};
1514
use itertools::Itertools;
1615
use object_store::ObjectStore;
1716
use url::Url;
1817

1918
use super::iterators::{AddIterator, AddView, AddViewItem};
2019
use super::lazy::LazySnapshot;
2120
use super::{Snapshot, SnapshotError};
21+
use crate::kernel::CommitInfo;
2222
use crate::{DeltaResult, DeltaTableConfig, DeltaTableError};
2323

2424
/// An eager snapshot of a Delta Table at a specific version.
@@ -77,6 +77,14 @@ impl Snapshot for EagerSnapshot {
7777
) -> DeltaResult<Option<SetTransaction>> {
7878
self.snapshot.application_transaction(app_id)
7979
}
80+
81+
fn commit_infos(
82+
&self,
83+
start_version: impl Into<Option<Version>>,
84+
limit: impl Into<Option<usize>>,
85+
) -> DeltaResult<impl Iterator<Item = (Version, CommitInfo)>> {
86+
self.snapshot.commit_infos(start_version, limit)
87+
}
8088
}
8189

8290
impl EagerSnapshot {
@@ -92,7 +100,7 @@ impl EagerSnapshot {
92100
LazySnapshot::try_new(Table::try_from_uri(table_root)?, store, version).await?;
93101
let files = config
94102
.require_files
95-
.then(|| -> DeltaResult<_> { Ok(replay_file_actions(&snapshot)?) })
103+
.then(|| -> DeltaResult<_> { replay_file_actions(&snapshot) })
96104
.transpose()?;
97105
Ok(Self {
98106
snapshot,
@@ -101,6 +109,10 @@ impl EagerSnapshot {
101109
})
102110
}
103111

112+
pub(crate) fn engine_ref(&self) -> &Arc<dyn Engine> {
113+
self.snapshot.engine_ref()
114+
}
115+
104116
pub fn file_data(&self) -> DeltaResult<&RecordBatch> {
105117
Ok(self
106118
.files
@@ -122,7 +134,7 @@ impl EagerSnapshot {
122134
.files
123135
.as_ref()
124136
.map(|f| f.num_rows())
125-
.ok_or_else(|| SnapshotError::FilesNotInitialized)?)
137+
.ok_or(SnapshotError::FilesNotInitialized)?)
126138
}
127139

128140
pub(crate) fn update(&mut self) -> DeltaResult<()> {

crates/core/src/kernel/snapshot_next/iterators.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ pub struct AddIterator<'a> {
2525
}
2626

2727
impl AddIterator<'_> {
28-
pub fn try_new<'a>(actions: &'a RecordBatch) -> DeltaResult<AddIterator<'a>> {
28+
pub fn try_new(actions: &RecordBatch) -> DeltaResult<AddIterator<'_>> {
2929
validate_column::<StringArray>(actions, &[ADD_NAME, "path"])?;
3030
validate_column::<Int64Array>(actions, &[ADD_NAME, "size"])?;
3131
validate_column::<Int64Array>(actions, &[ADD_NAME, "modificationTime"])?;
@@ -108,7 +108,7 @@ pub struct AddViewItem {
108108
}
109109

110110
impl AddViewItem {
111-
pub fn path<T: Array>(&self) -> &str {
111+
pub fn path(&self) -> &str {
112112
extract_column(&self.actions, &[ADD_NAME, "path"])
113113
.unwrap()
114114
.as_string::<i32>()
@@ -273,7 +273,7 @@ fn validate_column<'a, T: Array + 'static>(
273273
}
274274
} else {
275275
return Err(DeltaTableError::from(
276-
crate::protocol::ProtocolError::InvalidField(format!("Column not found",)),
276+
crate::protocol::ProtocolError::InvalidField("Column not found".to_string()),
277277
));
278278
}
279279
Ok(())

crates/core/src/kernel/snapshot_next/lazy.rs

+58-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! Snapshot of a Delta Table at a specific version.
22
//!
3+
use std::io::{BufRead, BufReader, Cursor};
34
use std::sync::{Arc, LazyLock};
45

56
use arrow::compute::filter_record_batch;
@@ -12,6 +13,7 @@ use delta_kernel::engine::default::executor::tokio::{
1213
TokioBackgroundExecutor, TokioMultiThreadExecutor,
1314
};
1415
use delta_kernel::engine::default::DefaultEngine;
16+
use delta_kernel::log_segment::LogSegment;
1517
use delta_kernel::schema::Schema;
1618
use delta_kernel::snapshot::Snapshot as SnapshotInner;
1719
use delta_kernel::table_properties::TableProperties;
@@ -23,6 +25,7 @@ use url::Url;
2325

2426
use super::cache::CommitCacheObjectStore;
2527
use super::Snapshot;
28+
use crate::kernel::{Action, CommitInfo};
2629
use crate::{DeltaResult, DeltaTableError};
2730

2831
// TODO: avoid repetitive parsing of json stats
@@ -35,7 +38,7 @@ pub struct LazySnapshot {
3538

3639
impl Snapshot for LazySnapshot {
3740
fn table_root(&self) -> &Url {
38-
&self.inner.table_root()
41+
self.inner.table_root()
3942
}
4043

4144
fn version(&self) -> Version {
@@ -55,7 +58,7 @@ impl Snapshot for LazySnapshot {
5558
}
5659

5760
fn table_properties(&self) -> &TableProperties {
58-
&self.inner.table_properties()
61+
self.inner.table_properties()
5962
}
6063

6164
fn files(&self) -> DeltaResult<impl Iterator<Item = DeltaResult<RecordBatch>>> {
@@ -96,6 +99,58 @@ impl Snapshot for LazySnapshot {
9699
let scanner = SetTransactionScanner::new(self.inner.clone());
97100
Ok(scanner.application_transaction(self.engine.as_ref(), app_id.as_ref())?)
98101
}
102+
103+
fn commit_infos(
104+
&self,
105+
start_version: impl Into<Option<Version>>,
106+
limit: impl Into<Option<usize>>,
107+
) -> DeltaResult<impl Iterator<Item = (Version, CommitInfo)>> {
108+
// let start_version = start_version.into();
109+
let fs_client = self.engine.get_file_system_client();
110+
let end_version = start_version.into().unwrap_or_else(|| self.version());
111+
let start_version = limit
112+
.into()
113+
.and_then(|limit| {
114+
if limit == 0 {
115+
Some(end_version)
116+
} else {
117+
Some(end_version.saturating_sub(limit as u64 - 1))
118+
}
119+
})
120+
.unwrap_or(0);
121+
let log_root = self.inner.table_root().join("_delta_log").unwrap();
122+
let mut log_segment = LogSegment::for_table_changes(
123+
fs_client.as_ref(),
124+
log_root,
125+
start_version,
126+
end_version,
127+
)?;
128+
log_segment.ascending_commit_files.reverse();
129+
let files = log_segment
130+
.ascending_commit_files
131+
.iter()
132+
.map(|commit_file| (commit_file.location.location.clone(), None))
133+
.collect_vec();
134+
135+
Ok(fs_client
136+
.read_files(files)?
137+
.zip(log_segment.ascending_commit_files.into_iter())
138+
.filter_map(|(data, path)| {
139+
data.ok().and_then(|d| {
140+
let reader = BufReader::new(Cursor::new(d));
141+
for line in reader.lines() {
142+
match line.and_then(|l| Ok(serde_json::from_str::<Action>(&l)?)) {
143+
Ok(Action::CommitInfo(commit_info)) => {
144+
return Some((path.version, commit_info))
145+
}
146+
Err(e) => return None,
147+
_ => continue,
148+
};
149+
}
150+
None
151+
})
152+
}))
153+
}
99154
}
100155

101156
impl LazySnapshot {
@@ -138,7 +193,7 @@ impl LazySnapshot {
138193
}
139194

140195
/// A shared reference to the engine used for interacting with the Delta Table.
141-
pub(super) fn engine_ref(&self) -> &Arc<dyn Engine> {
196+
pub(crate) fn engine_ref(&self) -> &Arc<dyn Engine> {
142197
&self.engine
143198
}
144199

crates/core/src/kernel/snapshot_next/mod.rs

+95-3
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@ use std::sync::Arc;
44

55
use arrow_array::RecordBatch;
66
use delta_kernel::actions::visitors::SetTransactionMap;
7-
use delta_kernel::actions::{Add, Metadata, Protocol, SetTransaction};
7+
use delta_kernel::actions::{Metadata, Protocol, SetTransaction};
88
use delta_kernel::expressions::{Scalar, StructData};
99
use delta_kernel::schema::Schema;
1010
use delta_kernel::table_properties::TableProperties;
1111
use delta_kernel::Version;
12-
use iterators::{AddIterator, AddView, AddViewItem};
12+
use iterators::{AddView, AddViewItem};
1313
use url::Url;
1414

15+
use crate::kernel::actions::CommitInfo;
1516
use crate::{DeltaResult, DeltaTableError};
1617

1718
pub use eager::EagerSnapshot;
@@ -77,7 +78,7 @@ pub trait Snapshot {
7778
fn files_view(
7879
&self,
7980
) -> DeltaResult<impl Iterator<Item = DeltaResult<impl Iterator<Item = AddViewItem>>>> {
80-
Ok(self.files()?.map(|r| r.and_then(|b| AddView::try_new(b))))
81+
Ok(self.files()?.map(|r| r.and_then(AddView::try_new)))
8182
}
8283

8384
fn tombstones(&self) -> DeltaResult<impl Iterator<Item = DeltaResult<RecordBatch>>>;
@@ -93,10 +94,40 @@ pub trait Snapshot {
9394
///
9495
/// Initiates a log scan, but terminates as soon as the transaction
9596
/// for the given application is found.
97+
///
98+
/// # Parameters
99+
/// - `app_id`: The application id for which to fetch the transaction.
100+
///
101+
/// # Returns
102+
/// The latest transaction for the given application id, if it exists.
96103
fn application_transaction(
97104
&self,
98105
app_id: impl AsRef<str>,
99106
) -> DeltaResult<Option<SetTransaction>>;
107+
108+
/// Get commit info for the table.
109+
///
110+
/// The [`CommitInfo`]s are returned in descending order of version
111+
/// with the most recent commit first starting from the `start_version`.
112+
///
113+
/// [`CommitInfo`]s are read on a best-effort basis. If the action
114+
/// for a version is not available or cannot be parsed, it is skipped.
115+
///
116+
/// # Parameters
117+
/// - `start_version`: The version from which to start fetching commit info.
118+
/// Defaults to the latest version.
119+
/// - `limit`: The maximum number of commit infos to fetch.
120+
///
121+
/// # Returns
122+
/// An iterator of commit info tuples. The first element of the tuple is the version
123+
/// of the commit, the second element is the corresponding commit info.
124+
// TODO(roeap): this is currently using our commit info, we should be using
125+
// the definition from kernel, once handling over there has matured.
126+
fn commit_infos(
127+
&self,
128+
start_version: impl Into<Option<Version>>,
129+
limit: impl Into<Option<usize>>,
130+
) -> DeltaResult<impl Iterator<Item = (Version, CommitInfo)>>;
100131
}
101132

102133
impl<T: Snapshot> Snapshot for Arc<T> {
@@ -142,6 +173,67 @@ impl<T: Snapshot> Snapshot for Arc<T> {
142173
) -> DeltaResult<Option<SetTransaction>> {
143174
self.as_ref().application_transaction(app_id)
144175
}
176+
177+
fn commit_infos(
178+
&self,
179+
start_version: impl Into<Option<Version>>,
180+
limit: impl Into<Option<usize>>,
181+
) -> DeltaResult<impl Iterator<Item = (Version, CommitInfo)>> {
182+
self.as_ref().commit_infos(start_version, limit)
183+
}
184+
}
185+
186+
impl<T: Snapshot> Snapshot for Box<T> {
187+
fn table_root(&self) -> &Url {
188+
self.as_ref().table_root()
189+
}
190+
191+
fn version(&self) -> Version {
192+
self.as_ref().version()
193+
}
194+
195+
fn schema(&self) -> &Schema {
196+
self.as_ref().schema()
197+
}
198+
199+
fn metadata(&self) -> &Metadata {
200+
self.as_ref().metadata()
201+
}
202+
203+
fn protocol(&self) -> &Protocol {
204+
self.as_ref().protocol()
205+
}
206+
207+
fn table_properties(&self) -> &TableProperties {
208+
self.as_ref().table_properties()
209+
}
210+
211+
fn files(&self) -> DeltaResult<impl Iterator<Item = DeltaResult<RecordBatch>>> {
212+
self.as_ref().files()
213+
}
214+
215+
fn tombstones(&self) -> DeltaResult<impl Iterator<Item = DeltaResult<RecordBatch>>> {
216+
self.as_ref().tombstones()
217+
}
218+
219+
fn application_transactions(&self) -> DeltaResult<SetTransactionMap> {
220+
self.as_ref().application_transactions()
221+
}
222+
223+
fn application_transaction(
224+
&self,
225+
app_id: impl AsRef<str>,
226+
) -> DeltaResult<Option<SetTransaction>> {
227+
self.as_ref().application_transaction(app_id)
228+
}
229+
230+
fn commit_infos(
231+
&self,
232+
start_version: impl Into<Option<Version>>,
233+
limit: impl Into<Option<usize>>,
234+
) -> DeltaResult<impl Iterator<Item = (Version, CommitInfo)>> {
235+
self.as_ref().commit_infos(start_version, limit)
236+
}
145237
}
146238

147239
#[cfg(test)]

0 commit comments

Comments
 (0)