@@ -27,14 +27,17 @@ use std::fmt::{self, Debug};
27
27
use std:: sync:: Arc ;
28
28
29
29
use arrow_array:: types:: UInt16Type ;
30
- use arrow_array:: { Array , DictionaryArray , RecordBatch , StringArray , TypedDictionaryArray } ;
30
+ use arrow_array:: {
31
+ Array , BooleanArray , DictionaryArray , RecordBatch , StringArray , TypedDictionaryArray ,
32
+ } ;
31
33
use arrow_cast:: display:: array_value_to_string;
32
34
use arrow_cast:: { cast_with_options, CastOptions } ;
33
35
use arrow_schema:: {
34
36
ArrowError , DataType as ArrowDataType , Field , Schema as ArrowSchema , SchemaRef ,
35
37
SchemaRef as ArrowSchemaRef , TimeUnit ,
36
38
} ;
37
39
use arrow_select:: concat:: concat_batches;
40
+ use arrow_select:: filter:: filter_record_batch;
38
41
use async_trait:: async_trait;
39
42
use chrono:: { DateTime , TimeZone , Utc } ;
40
43
use datafusion:: catalog:: { Session , TableProviderFactory } ;
@@ -87,7 +90,9 @@ use url::Url;
87
90
use crate :: delta_datafusion:: expr:: parse_predicate_expression;
88
91
use crate :: delta_datafusion:: schema_adapter:: DeltaSchemaAdapterFactory ;
89
92
use crate :: errors:: { DeltaResult , DeltaTableError } ;
90
- use crate :: kernel:: { Add , DataCheck , EagerSnapshot , Invariant , Snapshot , StructTypeExt } ;
93
+ use crate :: kernel:: {
94
+ Add , DataCheck , EagerSnapshot , Invariant , LogDataHandler , Snapshot , StructTypeExt ,
95
+ } ;
91
96
use crate :: logstore:: LogStoreRef ;
92
97
use crate :: table:: builder:: ensure_table_uri;
93
98
use crate :: table:: state:: DeltaTableState ;
@@ -573,6 +578,8 @@ impl<'a> DeltaScanBuilder<'a> {
573
578
. unwrap ( )
574
579
} ) ;
575
580
581
+ let mut pruning_mask: Option < _ > = None ;
582
+
576
583
// Perform Pruning of files to scan
577
584
let ( files, files_scanned, files_pruned) = match self . files {
578
585
Some ( files) => {
@@ -592,7 +599,9 @@ impl<'a> DeltaScanBuilder<'a> {
592
599
let files_to_prune = if let Some ( predicate) = & logical_filter {
593
600
let pruning_predicate =
594
601
PruningPredicate :: try_new ( predicate. clone ( ) , logical_schema. clone ( ) ) ?;
595
- pruning_predicate. prune ( self . snapshot ) ?
602
+ let mask = pruning_predicate. prune ( self . snapshot ) ?;
603
+ pruning_mask = Some ( mask. clone ( ) ) ;
604
+ mask
596
605
} else {
597
606
vec ! [ true ; num_containers]
598
607
} ;
@@ -695,10 +704,18 @@ impl<'a> DeltaScanBuilder<'a> {
695
704
) ) ;
696
705
}
697
706
698
- let stats = self
699
- . snapshot
700
- . datafusion_table_statistics ( )
701
- . unwrap_or ( Statistics :: new_unknown ( & schema) ) ;
707
+ // FIXME - where is the correct place to marry file pruning with statistics pruning?
708
+ // Temporarily re-generating the log handler, just so that we can compute the stats.
709
+ // Should we update datafusion_table_statistics to optionally take the mask?
710
+ let stats = if let Some ( mask) = pruning_mask {
711
+ let es = self . snapshot . snapshot ( ) ;
712
+ let pruned_stats = prune_file_statistics ( & es. files , mask) ;
713
+ LogDataHandler :: new ( & pruned_stats, es. metadata ( ) , es. schema ( ) ) . statistics ( )
714
+ } else {
715
+ self . snapshot . datafusion_table_statistics ( )
716
+ } ;
717
+
718
+ let stats = stats. unwrap_or ( Statistics :: new_unknown ( & schema) ) ;
702
719
703
720
let parquet_options = TableParquetOptions {
704
721
global : self . session . config ( ) . options ( ) . execution . parquet . clone ( ) ,
@@ -756,6 +773,27 @@ impl<'a> DeltaScanBuilder<'a> {
756
773
}
757
774
}
758
775
776
+ fn prune_file_statistics (
777
+ record_batches : & Vec < RecordBatch > ,
778
+ pruning_mask : Vec < bool > ,
779
+ ) -> Vec < RecordBatch > {
780
+ let mut filtered_batches = Vec :: new ( ) ;
781
+ let mut mask_offset = 0 ;
782
+
783
+ for batch in record_batches {
784
+ let num_rows = batch. num_rows ( ) ;
785
+ let batch_mask = & pruning_mask[ mask_offset..mask_offset + num_rows] ;
786
+ mask_offset += num_rows;
787
+
788
+ let boolean_mask = BooleanArray :: from ( batch_mask. to_vec ( ) ) ;
789
+ let filtered_batch =
790
+ filter_record_batch ( batch, & boolean_mask) . expect ( "Failed to filter RecordBatch" ) ;
791
+ filtered_batches. push ( filtered_batch) ;
792
+ }
793
+
794
+ filtered_batches
795
+ }
796
+
759
797
// TODO: implement this for Snapshot, not for DeltaTable
760
798
#[ async_trait]
761
799
impl TableProvider for DeltaTable {
0 commit comments