@@ -47,7 +47,7 @@ use datafusion::execution::context::{SessionConfig, SessionContext, SessionState
47
47
use datafusion:: execution:: runtime_env:: RuntimeEnv ;
48
48
use datafusion:: execution:: FunctionRegistry ;
49
49
use datafusion:: optimizer:: simplify_expressions:: ExprSimplifier ;
50
- use datafusion:: physical_optimizer:: pruning:: PruningPredicate ;
50
+ use datafusion:: physical_optimizer:: pruning:: { PruningPredicate , PruningStatistics } ;
51
51
use datafusion_common:: scalar:: ScalarValue ;
52
52
use datafusion_common:: tree_node:: { TreeNode , TreeNodeRecursion , TreeNodeVisitor } ;
53
53
use datafusion_common:: {
@@ -581,31 +581,64 @@ impl<'a> DeltaScanBuilder<'a> {
581
581
( files, files_scanned, 0 )
582
582
}
583
583
None => {
584
- if let Some ( predicate) = & logical_filter {
585
- let pruning_predicate =
586
- PruningPredicate :: try_new ( predicate. clone ( ) , logical_schema. clone ( ) ) ?;
587
- let files_to_prune = pruning_predicate. prune ( self . snapshot ) ?;
588
- let mut files_pruned = 0usize ;
589
- let files = self
584
+ // early return in case we have no push down filters or limit
585
+ if logical_filter. is_none ( ) && self . limit . is_none ( ) {
586
+ let files = self . snapshot . file_actions ( ) ?;
587
+ let files_scanned = files. len ( ) ;
588
+ ( files, files_scanned, 0 )
589
+ } else {
590
+ let num_containers = self . snapshot . num_containers ( ) ;
591
+
592
+ let files_to_prune = if let Some ( predicate) = & logical_filter {
593
+ let pruning_predicate =
594
+ PruningPredicate :: try_new ( predicate. clone ( ) , logical_schema. clone ( ) ) ?;
595
+ pruning_predicate. prune ( self . snapshot ) ?
596
+ } else {
597
+ vec ! [ true ; num_containers]
598
+ } ;
599
+
600
+ // needed to enforce limit and deal with missing statistics
601
+ // rust port of https://github.com/delta-io/delta/pull/1495
602
+ let mut pruned_without_stats = vec ! [ ] ;
603
+ let mut rows_collected = 0 ;
604
+ let mut files = vec ! [ ] ;
605
+
606
+ for ( action, keep) in self
590
607
. snapshot
591
608
. file_actions_iter ( ) ?
592
609
. zip ( files_to_prune. into_iter ( ) )
593
- . filter_map ( |( action, keep) | {
594
- if keep {
595
- Some ( action. to_owned ( ) )
610
+ {
611
+ // prune file based on predicate pushdown
612
+ if keep {
613
+ // prune file based on limit pushdown
614
+ if let Some ( limit) = self . limit {
615
+ if let Some ( stats) = action. get_stats ( ) ? {
616
+ if rows_collected <= limit as i64 {
617
+ rows_collected += stats. num_records ;
618
+ files. push ( action. to_owned ( ) ) ;
619
+ } else {
620
+ break ;
621
+ }
622
+ } else {
623
+ // some files are missing stats; skipping but storing them
624
+ // in a list in case we can't reach the target limit
625
+ pruned_without_stats. push ( action. to_owned ( ) ) ;
626
+ }
596
627
} else {
597
- files_pruned += 1 ;
598
- None
628
+ files. push ( action. to_owned ( ) ) ;
599
629
}
600
- } )
601
- . collect :: < Vec < _ > > ( ) ;
630
+ }
631
+ }
632
+
633
+ if let Some ( limit) = self . limit {
634
+ if rows_collected < limit as i64 {
635
+ files. extend ( pruned_without_stats) ;
636
+ }
637
+ }
602
638
603
639
let files_scanned = files. len ( ) ;
640
+ let files_pruned = num_containers - files_scanned;
604
641
( files, files_scanned, files_pruned)
605
- } else {
606
- let files = self . snapshot . file_actions ( ) ?;
607
- let files_scanned = files. len ( ) ;
608
- ( files, files_scanned, 0 )
609
642
}
610
643
}
611
644
} ;
0 commit comments