17
17
18
18
//! Table scan api.
19
19
20
- use std:: collections:: HashMap ;
20
+ use std:: collections:: { HashMap , HashSet } ;
21
21
use std:: sync:: { Arc , RwLock } ;
22
22
23
23
use arrow_array:: RecordBatch ;
24
24
use futures:: channel:: mpsc:: { channel, Sender } ;
25
25
use futures:: stream:: BoxStream ;
26
26
use futures:: { SinkExt , StreamExt , TryFutureExt , TryStreamExt } ;
27
+ use itertools:: Itertools ;
27
28
use serde:: { Deserialize , Serialize } ;
28
29
29
30
use crate :: arrow:: ArrowReaderBuilder ;
@@ -36,8 +37,8 @@ use crate::io::object_cache::ObjectCache;
36
37
use crate :: io:: FileIO ;
37
38
use crate :: runtime:: spawn;
38
39
use crate :: spec:: {
39
- DataContentType , DataFileFormat , ManifestEntryRef , ManifestFile , ManifestList , Schema ,
40
- SchemaRef , SnapshotRef , TableMetadataRef ,
40
+ DataContentType , DataFileFormat , ManifestEntryRef , ManifestFile , ManifestList , ManifestStatus ,
41
+ Operation , Schema , SchemaRef , SnapshotRef , TableMetadataRef ,
41
42
} ;
42
43
use crate :: table:: Table ;
43
44
use crate :: utils:: available_parallelism;
@@ -54,6 +55,10 @@ pub struct TableScanBuilder<'a> {
54
55
// Defaults to none which means select all columns
55
56
column_names : Option < Vec < String > > ,
56
57
snapshot_id : Option < i64 > ,
58
+ /// Exclusive. Used for incremental scan.
59
+ from_snapshot_id : Option < i64 > ,
60
+ /// Inclusive. Used for incremental scan.
61
+ to_snapshot_id : Option < i64 > ,
57
62
batch_size : Option < usize > ,
58
63
case_sensitive : bool ,
59
64
filter : Option < Predicate > ,
@@ -72,6 +77,8 @@ impl<'a> TableScanBuilder<'a> {
72
77
table,
73
78
column_names : None ,
74
79
snapshot_id : None ,
80
+ from_snapshot_id : None ,
81
+ to_snapshot_id : None ,
75
82
batch_size : None ,
76
83
case_sensitive : true ,
77
84
filter : None ,
@@ -133,6 +140,18 @@ impl<'a> TableScanBuilder<'a> {
133
140
self
134
141
}
135
142
143
+ /// Set the starting snapshot id (exclusive) for incremental scan.
144
+ pub fn from_snapshot_id ( mut self , from_snapshot_id : i64 ) -> Self {
145
+ self . from_snapshot_id = Some ( from_snapshot_id) ;
146
+ self
147
+ }
148
+
149
+ /// Set the ending snapshot id (inclusive) for incremental scan.
150
+ pub fn to_snapshot_id ( mut self , to_snapshot_id : i64 ) -> Self {
151
+ self . to_snapshot_id = Some ( to_snapshot_id) ;
152
+ self
153
+ }
154
+
136
155
/// Sets the concurrency limit for both manifest files and manifest
137
156
/// entries for this scan
138
157
pub fn with_concurrency_limit ( mut self , limit : usize ) -> Self {
@@ -209,6 +228,8 @@ impl<'a> TableScanBuilder<'a> {
209
228
} ) ?
210
229
. clone ( ) ,
211
230
} ;
231
+ // TODO: validate that either a snapshot is set (snapshot scan) or both
232
+ // `from_snapshot_id` and `to_snapshot_id` are set (incremental scan).
212
233
213
234
let schema = snapshot. schema ( self . table . metadata ( ) ) ?;
214
235
@@ -289,6 +310,8 @@ impl<'a> TableScanBuilder<'a> {
289
310
snapshot_bound_predicate : snapshot_bound_predicate. map ( Arc :: new) ,
290
311
object_cache : self . table . object_cache ( ) ,
291
312
field_ids : Arc :: new ( field_ids) ,
313
+ from_snapshot_id : self . from_snapshot_id ,
314
+ to_snapshot_id : self . to_snapshot_id ,
292
315
partition_filter_cache : Arc :: new ( PartitionFilterCache :: new ( ) ) ,
293
316
manifest_evaluator_cache : Arc :: new ( ManifestEvaluatorCache :: new ( ) ) ,
294
317
expression_evaluator_cache : Arc :: new ( ExpressionEvaluatorCache :: new ( ) ) ,
@@ -344,6 +367,8 @@ struct PlanContext {
344
367
snapshot_bound_predicate : Option < Arc < BoundPredicate > > ,
345
368
object_cache : Arc < ObjectCache > ,
346
369
field_ids : Arc < Vec < i32 > > ,
370
+ from_snapshot_id : Option < i64 > ,
371
+ to_snapshot_id : Option < i64 > ,
347
372
348
373
partition_filter_cache : Arc < PartitionFilterCache > ,
349
374
manifest_evaluator_cache : Arc < ManifestEvaluatorCache > ,
@@ -362,6 +387,66 @@ impl TableScan {
362
387
// used to stream the results back to the caller
363
388
let ( file_scan_task_tx, file_scan_task_rx) = channel ( concurrency_limit_manifest_entries) ;
364
389
390
+ if let ( Some ( from_snapshot_id) , Some ( to_snapshot_id) ) = (
391
+ self . plan_context . from_snapshot_id ,
392
+ self . plan_context . to_snapshot_id ,
393
+ ) {
394
+ // Incremental scan mode
395
+ let added_files = added_files_between (
396
+ & self . plan_context . object_cache ,
397
+ & self . plan_context . table_metadata ,
398
+ to_snapshot_id,
399
+ from_snapshot_id,
400
+ )
401
+ . await ?;
402
+
403
+ for entry in added_files {
404
+ let manifest_entry_context = ManifestEntryContext {
405
+ manifest_entry : entry,
406
+ expression_evaluator_cache : self
407
+ . plan_context
408
+ . expression_evaluator_cache
409
+ . clone ( ) ,
410
+ field_ids : self . plan_context . field_ids . clone ( ) ,
411
+ bound_predicates : None , // TODO: support predicates in incremental scan
412
+ partition_spec_id : 0 , // TODO: get correct partition spec id
413
+ // It's used to skip any data file whose partition data indicates that it can't contain
414
+ // any data that matches this scan's filter
415
+ snapshot_schema : self . plan_context . snapshot_schema . clone ( ) ,
416
+ } ;
417
+
418
+ manifest_entry_ctx_tx
419
+ . clone ( )
420
+ . send ( manifest_entry_context)
421
+ . await
422
+ . map_err ( |_| Error :: new ( ErrorKind :: Unexpected , "mpsc channel SendError" ) ) ?;
423
+ }
424
+
425
+ let mut channel_for_manifest_entry_error = file_scan_task_tx. clone ( ) ;
426
+
427
+ // Process the [`ManifestEntry`] stream in parallel
428
+ spawn ( async move {
429
+ let result = manifest_entry_ctx_rx
430
+ . map ( |me_ctx| Ok ( ( me_ctx, file_scan_task_tx. clone ( ) ) ) )
431
+ . try_for_each_concurrent (
432
+ concurrency_limit_manifest_entries,
433
+ |( manifest_entry_context, tx) | async move {
434
+ spawn ( async move {
435
+ Self :: process_manifest_entry ( manifest_entry_context, tx) . await
436
+ } )
437
+ . await
438
+ } ,
439
+ )
440
+ . await ;
441
+
442
+ if let Err ( error) = result {
443
+ let _ = channel_for_manifest_entry_error. send ( Err ( error) ) . await ;
444
+ }
445
+ } ) ;
446
+
447
+ return Ok ( file_scan_task_rx. boxed ( ) ) ;
448
+ }
449
+
365
450
let manifest_list = self . plan_context . get_manifest_list ( ) . await ?;
366
451
367
452
// get the [`ManifestFile`]s from the [`ManifestList`], filtering out
@@ -947,6 +1032,100 @@ impl FileScanTask {
947
1032
}
948
1033
}
949
1034
1035
+ struct Ancestors {
1036
+ next : Option < SnapshotRef > ,
1037
+ get_snapshot : Box < dyn Fn ( i64 ) -> Option < SnapshotRef > + Send > ,
1038
+ }
1039
+
1040
+ impl Iterator for Ancestors {
1041
+ type Item = SnapshotRef ;
1042
+
1043
+ fn next ( & mut self ) -> Option < Self :: Item > {
1044
+ let snapshot = self . next . take ( ) ?;
1045
+ let result = snapshot. clone ( ) ;
1046
+ self . next = snapshot
1047
+ . parent_snapshot_id ( )
1048
+ . and_then ( |id| ( self . get_snapshot ) ( id) ) ;
1049
+ Some ( result)
1050
+ }
1051
+ }
1052
+
1053
+ /// Iterate starting from `snapshot` (inclusive) to the root snapshot.
1054
+ fn ancestors_of (
1055
+ table_metadata : & TableMetadataRef ,
1056
+ snapshot : i64 ,
1057
+ ) -> Box < dyn Iterator < Item = SnapshotRef > + Send > {
1058
+ if let Some ( snapshot) = table_metadata. snapshot_by_id ( snapshot) {
1059
+ let table_metadata = table_metadata. clone ( ) ;
1060
+ Box :: new ( Ancestors {
1061
+ next : Some ( snapshot. clone ( ) ) ,
1062
+ get_snapshot : Box :: new ( move |id| table_metadata. snapshot_by_id ( id) . cloned ( ) ) ,
1063
+ } )
1064
+ } else {
1065
+ Box :: new ( std:: iter:: empty ( ) )
1066
+ }
1067
+ }
1068
+
1069
+ /// Iterate starting from `snapshot` (inclusive) to `oldest_snapshot_id` (exclusive).
1070
+ fn ancestors_between (
1071
+ table_metadata : & TableMetadataRef ,
1072
+ latest_snapshot_id : i64 ,
1073
+ oldest_snapshot_id : i64 ,
1074
+ ) -> Box < dyn Iterator < Item = SnapshotRef > + Send > {
1075
+ if latest_snapshot_id == oldest_snapshot_id {
1076
+ return Box :: new ( std:: iter:: empty ( ) ) ;
1077
+ }
1078
+
1079
+ Box :: new (
1080
+ ancestors_of ( table_metadata, latest_snapshot_id)
1081
+ . take_while ( move |snapshot| snapshot. snapshot_id ( ) != oldest_snapshot_id) ,
1082
+ )
1083
+ }
1084
+
1085
+ /// Get all added files between two snapshots.
1086
+ /// The files in `latest_snapshot_id` (inclusive) but not in `oldest_snapshot_id` (exclusive).
1087
+ async fn added_files_between (
1088
+ object_cache : & ObjectCache ,
1089
+ table_metadata : & TableMetadataRef ,
1090
+ latest_snapshot_id : i64 ,
1091
+ oldest_snapshot_id : i64 ,
1092
+ ) -> Result < Vec < ManifestEntryRef > > {
1093
+ let mut added_files = vec ! [ ] ;
1094
+
1095
+ let append_snapshots =
1096
+ ancestors_between ( table_metadata, latest_snapshot_id, oldest_snapshot_id)
1097
+ . filter ( |snapshot| matches ! ( snapshot. summary( ) . operation, Operation :: Append ) )
1098
+ . collect_vec ( ) ;
1099
+ let snapshot_ids: HashSet < i64 > = append_snapshots
1100
+ . iter ( )
1101
+ . map ( |snapshot| snapshot. snapshot_id ( ) )
1102
+ . collect ( ) ;
1103
+
1104
+ for snapshot in append_snapshots {
1105
+ let manifest_list = object_cache
1106
+ . get_manifest_list ( & snapshot, & table_metadata)
1107
+ . await ?;
1108
+
1109
+ for manifest_file in manifest_list. entries ( ) {
1110
+ if !snapshot_ids. contains ( & manifest_file. added_snapshot_id ) {
1111
+ continue ;
1112
+ }
1113
+ let manifest = object_cache. get_manifest ( & manifest_file) . await ?;
1114
+ let entries = manifest. entries ( ) . into_iter ( ) . cloned ( ) . filter ( |entry| {
1115
+ matches ! ( entry. status( ) , ManifestStatus :: Added )
1116
+ && (
1117
+ // Is it possible that the snapshot id here is not contained?
1118
+ entry. snapshot_id ( ) . is_none ( )
1119
+ || snapshot_ids. contains ( & entry. snapshot_id ( ) . unwrap ( ) )
1120
+ )
1121
+ } ) ;
1122
+ added_files. extend ( entries) ;
1123
+ }
1124
+ }
1125
+
1126
+ Ok ( added_files)
1127
+ }
1128
+
950
1129
#[ cfg( test) ]
951
1130
mod tests {
952
1131
use std:: collections:: HashMap ;
@@ -1709,7 +1888,6 @@ mod tests {
1709
1888
let mut values = vec ! [ 2 ; 512 ] ;
1710
1889
values. append ( vec ! [ 3 ; 200 ] . as_mut ( ) ) ;
1711
1890
values. append ( vec ! [ 4 ; 300 ] . as_mut ( ) ) ;
1712
- values. append ( vec ! [ 5 ; 12 ] . as_mut ( ) ) ;
1713
1891
let expected_y = Arc :: new ( Int64Array :: from_iter_values ( values) ) as ArrayRef ;
1714
1892
assert_eq ! ( col, & expected_y) ;
1715
1893
0 commit comments