@@ -49,7 +49,7 @@ use datafusion::execution::context::{SessionConfig, SessionContext, SessionState
4949use datafusion:: execution:: runtime_env:: RuntimeEnv ;
5050use datafusion:: execution:: FunctionRegistry ;
5151use datafusion:: optimizer:: simplify_expressions:: ExprSimplifier ;
52- use datafusion:: physical_optimizer:: pruning:: PruningPredicate ;
52+ use datafusion:: physical_optimizer:: pruning:: { PruningPredicate , PruningStatistics } ;
5353use datafusion_common:: scalar:: ScalarValue ;
5454use datafusion_common:: tree_node:: { TreeNode , TreeNodeRecursion , TreeNodeVisitor } ;
5555use datafusion_common:: {
@@ -570,31 +570,64 @@ impl<'a> DeltaScanBuilder<'a> {
570570 ( files, files_scanned, 0 )
571571 }
572572 None => {
573- if let Some ( predicate) = & logical_filter {
574- let pruning_predicate =
575- PruningPredicate :: try_new ( predicate. clone ( ) , logical_schema. clone ( ) ) ?;
576- let files_to_prune = pruning_predicate. prune ( self . snapshot ) ?;
577- let mut files_pruned = 0usize ;
578- let files = self
573+ // early return in case we have no push down filters or limit
574+ if logical_filter. is_none ( ) && self . limit . is_none ( ) {
575+ let files = self . snapshot . file_actions ( ) ?;
576+ let files_scanned = files. len ( ) ;
577+ ( files, files_scanned, 0 )
578+ } else {
579+ let num_containers = self . snapshot . num_containers ( ) ;
580+
581+ let files_to_prune = if let Some ( predicate) = & logical_filter {
582+ let pruning_predicate =
583+ PruningPredicate :: try_new ( predicate. clone ( ) , logical_schema. clone ( ) ) ?;
584+ pruning_predicate. prune ( self . snapshot ) ?
585+ } else {
586+ vec ! [ true ; num_containers]
587+ } ;
588+
589+ // needed to enforce limit and deal with missing statistics
590+ // rust port of https://github.com/delta-io/delta/pull/1495
591+ let mut pruned_without_stats = vec ! [ ] ;
592+ let mut rows_collected = 0 ;
593+ let mut files = vec ! [ ] ;
594+
595+ for ( action, keep) in self
579596 . snapshot
580597 . file_actions_iter ( ) ?
581598 . zip ( files_to_prune. into_iter ( ) )
582- . filter_map ( |( action, keep) | {
583- if keep {
584- Some ( action. to_owned ( ) )
599+ {
600+ // prune file based on predicate pushdown
601+ if keep {
602+ // prune file based on limit pushdown
603+ if let Some ( limit) = self . limit {
604+ if let Some ( stats) = action. get_stats ( ) ? {
605+ if rows_collected <= limit as i64 {
606+ rows_collected += stats. num_records ;
607+ files. push ( action. to_owned ( ) ) ;
608+ } else {
609+ break ;
610+ }
611+ } else {
612+ // some files are missing stats; skipping but storing them
613+ // in a list in case we can't reach the target limit
614+ pruned_without_stats. push ( action. to_owned ( ) ) ;
615+ }
585616 } else {
586- files_pruned += 1 ;
587- None
617+ files. push ( action. to_owned ( ) ) ;
588618 }
589- } )
590- . collect :: < Vec < _ > > ( ) ;
619+ }
620+ }
621+
622+ if let Some ( limit) = self . limit {
623+ if rows_collected < limit as i64 {
624+ files. extend ( pruned_without_stats) ;
625+ }
626+ }
591627
592628 let files_scanned = files. len ( ) ;
629+ let files_pruned = num_containers - files_scanned;
593630 ( files, files_scanned, files_pruned)
594- } else {
595- let files = self . snapshot . file_actions ( ) ?;
596- let files_scanned = files. len ( ) ;
597- ( files, files_scanned, 0 )
598631 }
599632 }
600633 } ;
0 commit comments