@@ -49,7 +49,7 @@ use datafusion::execution::context::{SessionConfig, SessionContext, SessionState
49
49
use datafusion:: execution:: runtime_env:: RuntimeEnv ;
50
50
use datafusion:: execution:: FunctionRegistry ;
51
51
use datafusion:: optimizer:: simplify_expressions:: ExprSimplifier ;
52
- use datafusion:: physical_optimizer:: pruning:: PruningPredicate ;
52
+ use datafusion:: physical_optimizer:: pruning:: { PruningPredicate , PruningStatistics } ;
53
53
use datafusion_common:: scalar:: ScalarValue ;
54
54
use datafusion_common:: tree_node:: { TreeNode , TreeNodeRecursion , TreeNodeVisitor } ;
55
55
use datafusion_common:: {
@@ -570,31 +570,64 @@ impl<'a> DeltaScanBuilder<'a> {
570
570
( files, files_scanned, 0 )
571
571
}
572
572
None => {
573
- if let Some ( predicate) = & logical_filter {
574
- let pruning_predicate =
575
- PruningPredicate :: try_new ( predicate. clone ( ) , logical_schema. clone ( ) ) ?;
576
- let files_to_prune = pruning_predicate. prune ( self . snapshot ) ?;
577
- let mut files_pruned = 0usize ;
578
- let files = self
573
+ // early return in case we have no push down filters or limit
574
+ if logical_filter. is_none ( ) && self . limit . is_none ( ) {
575
+ let files = self . snapshot . file_actions ( ) ?;
576
+ let files_scanned = files. len ( ) ;
577
+ ( files, files_scanned, 0 )
578
+ } else {
579
+ let num_containers = self . snapshot . num_containers ( ) ;
580
+
581
+ let files_to_prune = if let Some ( predicate) = & logical_filter {
582
+ let pruning_predicate =
583
+ PruningPredicate :: try_new ( predicate. clone ( ) , logical_schema. clone ( ) ) ?;
584
+ pruning_predicate. prune ( self . snapshot ) ?
585
+ } else {
586
+ vec ! [ true ; num_containers]
587
+ } ;
588
+
589
+ // needed to enforce limit and deal with missing statistics
590
+ // rust port of https://github.com/delta-io/delta/pull/1495
591
+ let mut pruned_without_stats = vec ! [ ] ;
592
+ let mut rows_collected = 0 ;
593
+ let mut files = vec ! [ ] ;
594
+
595
+ for ( action, keep) in self
579
596
. snapshot
580
597
. file_actions_iter ( ) ?
581
598
. zip ( files_to_prune. into_iter ( ) )
582
- . filter_map ( |( action, keep) | {
583
- if keep {
584
- Some ( action. to_owned ( ) )
599
+ {
600
+ // prune file based on predicate pushdown
601
+ if keep {
602
+ // prune file based on limit pushdown
603
+ if let Some ( limit) = self . limit {
604
+ if let Some ( stats) = action. get_stats ( ) ? {
605
+ if rows_collected <= limit as i64 {
606
+ rows_collected += stats. num_records ;
607
+ files. push ( action. to_owned ( ) ) ;
608
+ } else {
609
+ break ;
610
+ }
611
+ } else {
612
+ // some files are missing stats; skipping but storing them
613
+ // in a list in case we can't reach the target limit
614
+ pruned_without_stats. push ( action. to_owned ( ) ) ;
615
+ }
585
616
} else {
586
- files_pruned += 1 ;
587
- None
617
+ files. push ( action. to_owned ( ) ) ;
588
618
}
589
- } )
590
- . collect :: < Vec < _ > > ( ) ;
619
+ }
620
+ }
621
+
622
+ if let Some ( limit) = self . limit {
623
+ if rows_collected < limit as i64 {
624
+ files. extend ( pruned_without_stats) ;
625
+ }
626
+ }
591
627
592
628
let files_scanned = files. len ( ) ;
629
+ let files_pruned = num_containers - files_scanned;
593
630
( files, files_scanned, files_pruned)
594
- } else {
595
- let files = self . snapshot . file_actions ( ) ?;
596
- let files_scanned = files. len ( ) ;
597
- ( files, files_scanned, 0 )
598
631
}
599
632
}
600
633
} ;
0 commit comments