@@ -25,13 +25,14 @@ use futures::channel::oneshot;
25
25
use futures:: future:: join_all;
26
26
use futures:: { StreamExt , TryStreamExt } ;
27
27
28
+ use crate :: arrow:: record_batch_transformer:: RecordBatchTransformer ;
28
29
use crate :: arrow:: ArrowReader ;
29
30
use crate :: delete_vector:: DeleteVector ;
30
31
use crate :: expr:: Predicate :: AlwaysTrue ;
31
32
use crate :: expr:: { Bind , BoundPredicate , Predicate } ;
32
33
use crate :: io:: FileIO ;
33
34
use crate :: scan:: { ArrowRecordBatchStream , FileScanTask , FileScanTaskDeleteFile } ;
34
- use crate :: spec:: DataContentType ;
35
+ use crate :: spec:: { DataContentType , Schema , SchemaRef } ;
35
36
use crate :: { Error , ErrorKind , Result } ;
36
37
37
38
#[ allow( unused) ]
@@ -164,7 +165,7 @@ impl CachingDeleteFileManager {
164
165
/// * The unbound Predicates resulting from equality deletes are sent to their associated oneshot
165
166
/// channel to store them in the right place in the delete file managers state.
166
167
/// * The results of all of these futures are awaited on in parallel with the specified
167
- /// level of concurrency and collected into a vec. We then combine all of the delete
168
+ /// level of concurrency and collected into a vec. We then combine all the delete
168
169
/// vector maps that resulted from any positional delete or delete vector files into a
169
170
/// single map and persist it in the state.
170
171
///
@@ -206,19 +207,27 @@ impl CachingDeleteFileManager {
206
207
pub ( crate ) async fn load_deletes (
207
208
& self ,
208
209
delete_file_entries : & [ FileScanTaskDeleteFile ] ,
210
+ schema : SchemaRef ,
209
211
) -> Result < ( ) > {
210
212
let stream_items = delete_file_entries
211
213
. iter ( )
212
- . map ( |t| ( t. clone ( ) , self . file_io . clone ( ) , self . state . clone ( ) ) )
214
+ . map ( |t| {
215
+ (
216
+ t. clone ( ) ,
217
+ self . file_io . clone ( ) ,
218
+ self . state . clone ( ) ,
219
+ schema. clone ( ) ,
220
+ )
221
+ } )
213
222
. collect :: < Vec < _ > > ( ) ;
214
223
// NOTE: removing the collect and just passing the iterator to futures::stream:iter
215
224
// results in an error 'implementation of `std::ops::FnOnce` is not general enough'
216
225
217
226
let task_stream = futures:: stream:: iter ( stream_items. into_iter ( ) ) ;
218
227
219
228
let results: Vec < ParsedDeleteFileContext > = task_stream
220
- . map ( move |( task, file_io, state_ref) | async {
221
- Self :: load_file_for_task ( task, file_io, state_ref) . await
229
+ . map ( move |( task, file_io, state_ref, schema ) | async {
230
+ Self :: load_file_for_task ( task, file_io, state_ref, schema ) . await
222
231
} )
223
232
. map ( move |ctx| Ok ( async { Self :: parse_file_content_for_task ( ctx. await ?) . await } ) )
224
233
. try_buffer_unordered ( self . concurrency_limit_data_files )
@@ -248,6 +257,7 @@ impl CachingDeleteFileManager {
248
257
task : FileScanTaskDeleteFile ,
249
258
file_io : FileIO ,
250
259
state : StateRef ,
260
+ schema : SchemaRef ,
251
261
) -> Result < DeleteFileContext > {
252
262
match task. file_type {
253
263
DataContentType :: PositionDeletes => Ok ( DeleteFileContext :: PosDels (
@@ -271,7 +281,11 @@ impl CachingDeleteFileManager {
271
281
} ;
272
282
273
283
Ok ( DeleteFileContext :: FreshEqDel {
274
- batch_stream : Self :: parquet_to_batch_stream ( & task. file_path , file_io) . await ?,
284
+ batch_stream : Self :: evolve_schema (
285
+ Self :: parquet_to_batch_stream ( & task. file_path , file_io) . await ?,
286
+ schema,
287
+ )
288
+ . await ?,
275
289
sender,
276
290
} )
277
291
}
@@ -351,6 +365,30 @@ impl CachingDeleteFileManager {
351
365
Ok ( Box :: pin ( record_batch_stream) as ArrowRecordBatchStream )
352
366
}
353
367
368
+ /// Evolves the schema of the RecordBatches from an equality delete file
369
+ async fn evolve_schema (
370
+ record_batch_stream : ArrowRecordBatchStream ,
371
+ target_schema : Arc < Schema > ,
372
+ ) -> Result < ArrowRecordBatchStream > {
373
+ let eq_ids = target_schema
374
+ . as_ref ( )
375
+ . field_id_to_name_map ( )
376
+ . keys ( )
377
+ . cloned ( )
378
+ . collect :: < Vec < _ > > ( ) ;
379
+
380
+ let mut record_batch_transformer =
381
+ RecordBatchTransformer :: build ( target_schema. clone ( ) , & eq_ids) ;
382
+
383
+ let record_batch_stream = record_batch_stream. map ( move |record_batch| {
384
+ record_batch. and_then ( |record_batch| {
385
+ record_batch_transformer. process_record_batch ( record_batch)
386
+ } )
387
+ } ) ;
388
+
389
+ Ok ( Box :: pin ( record_batch_stream) as ArrowRecordBatchStream )
390
+ }
391
+
354
392
/// Parses a record batch stream coming from positional delete files
355
393
///
356
394
/// Returns a map of data file path to a delete vector
@@ -483,7 +521,7 @@ mod tests {
483
521
let file_scan_tasks = setup ( table_location) ;
484
522
485
523
let result = delete_file_manager
486
- . load_deletes ( & file_scan_tasks[ 0 ] . deletes )
524
+ . load_deletes ( & file_scan_tasks[ 0 ] . deletes , file_scan_tasks [ 0 ] . schema_ref ( ) )
487
525
. await ;
488
526
489
527
assert ! ( result. is_err_and( |e| e. kind( ) == ErrorKind :: FeatureUnsupported ) ) ;
0 commit comments