 use std::any::Any;
-use std::fmt::Formatter;
 use std::sync::Arc;
 
-use arrow_schema::SchemaRef;
-use datafusion::execution::{SendableRecordBatchStream, TaskContext};
-use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan};
+use arrow_schema::{Schema, SchemaRef};
+use async_trait::async_trait;
+use datafusion::catalog::Session;
+use datafusion::catalog::TableProvider;
+use datafusion::execution::SessionState;
+use datafusion_common::{exec_datafusion_err, Column, DFSchema, Result as DataFusionResult};
+use datafusion_expr::utils::conjunction;
+use datafusion_expr::{Expr, TableProviderFilterPushDown, TableType};
+use datafusion_physical_expr::PhysicalExpr;
+use datafusion_physical_plan::filter::FilterExec;
+use datafusion_physical_plan::limit::GlobalLimitExec;
+use datafusion_physical_plan::projection::ProjectionExec;
+use datafusion_physical_plan::ExecutionPlan;
 
-/// Physical execution of a scan
-#[derive(Debug, Clone)]
-pub struct DeltaCdfScan {
-    plan: Arc<dyn ExecutionPlan>,
-}
+use crate::DeltaTableError;
+use crate::{
+    delta_datafusion::DataFusionMixins, operations::load_cdf::CdfLoadBuilder, DeltaResult,
+};
 
-impl DeltaCdfScan {
-    /// Creates a new scan
-    pub fn new(plan: Arc<dyn ExecutionPlan>) -> Self {
-        Self { plan }
-    }
+use super::ADD_PARTITION_SCHEMA;
+
+fn session_state_from_session(session: &dyn Session) -> DataFusionResult<&SessionState> {
+    session
+        .as_any()
+        .downcast_ref::<SessionState>()
+        .ok_or_else(|| exec_datafusion_err!("Failed to downcast Session to SessionState"))
 }
 
-impl DisplayAs for DeltaCdfScan {
-    fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
-        write!(f, "{:?}", self)
-    }
+#[derive(Debug)]
+pub struct DeltaCdfTableProvider {
+    cdf_builder: CdfLoadBuilder,
+    schema: SchemaRef,
 }
 
-impl ExecutionPlan for DeltaCdfScan {
-    fn name(&self) -> &str {
-        Self::static_name()
+impl DeltaCdfTableProvider {
+    /// Build a DeltaCDFTableProvider
+    pub fn try_new(cdf_builder: CdfLoadBuilder) -> DeltaResult<Self> {
+        let mut fields = cdf_builder.snapshot.input_schema()?.fields().to_vec();
+        for f in ADD_PARTITION_SCHEMA.clone() {
+            fields.push(f.into());
+        }
+        Ok(DeltaCdfTableProvider {
+            cdf_builder,
+            schema: Schema::new(fields).into(),
+        })
     }
+}
 
+#[async_trait]
+impl TableProvider for DeltaCdfTableProvider {
     fn as_any(&self) -> &dyn Any {
         self
     }
 
     fn schema(&self) -> SchemaRef {
-        self.plan.schema().clone()
+        self.schema.clone()
     }
 
-    fn properties(&self) -> &datafusion::physical_plan::PlanProperties {
-        self.plan.properties()
+    fn table_type(&self) -> TableType {
+        TableType::Base
     }
 
-    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
-        vec![]
-    }
+    async fn scan(
+        &self,
+        session: &dyn Session,
+        projection: Option<&Vec<usize>>,
+        filters: &[Expr],
+        limit: Option<usize>,
+    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
+        let session_state = session_state_from_session(session)?;
+        let mut plan = self.cdf_builder.build(session_state).await?;
+
+        let df_schema: DFSchema = plan.schema().try_into()?;
+
+        if let Some(filter_expr) = conjunction(filters.iter().cloned()) {
+            let physical_expr = session.create_physical_expr(filter_expr, &df_schema)?;
+            plan = Arc::new(FilterExec::try_new(physical_expr, plan)?);
+        }
+
+        if let Some(projection) = projection {
+            let current_projection = (0..plan.schema().fields().len()).collect::<Vec<usize>>();
+            if projection != &current_projection {
+                let fields: DeltaResult<Vec<(Arc<dyn PhysicalExpr>, String)>> = projection
+                    .iter()
+                    .map(|i| {
+                        let (table_ref, field) = df_schema.qualified_field(*i);
+                        session
+                            .create_physical_expr(
+                                Expr::Column(Column::from((table_ref, field))),
+                                &df_schema,
+                            )
+                            .map(|expr| (expr, field.name().clone()))
+                            .map_err(DeltaTableError::from)
+                    })
+                    .collect();
+                let fields = fields?;
+                plan = Arc::new(ProjectionExec::try_new(fields, plan)?);
+            }
+        }
 
-    fn with_new_children(
-        self: Arc<Self>,
-        _children: Vec<Arc<dyn ExecutionPlan>>,
-    ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
-        self.plan.clone().with_new_children(_children)
+        if let Some(limit) = limit {
+            plan = Arc::new(GlobalLimitExec::new(plan, 0, Some(limit)))
+        };
+        Ok(plan)
     }
 
-    fn execute(
+    fn supports_filters_pushdown(
         &self,
-        partition: usize,
-        context: Arc<TaskContext>,
-    ) -> datafusion_common::Result<SendableRecordBatchStream> {
-        self.plan.execute(partition, context)
+        filter: &[&Expr],
+    ) -> DataFusionResult<Vec<TableProviderFilterPushDown>> {
+        Ok(filter
+            .iter()
+            .map(|_| TableProviderFilterPushDown::Exact) // maybe exact
+            .collect())
     }
 }
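For reviewers, a minimal usage sketch of the new provider follows. The import path for `DeltaCdfTableProvider`, the `DeltaOps::try_from_uri(...).load_cdf().with_starting_version(...)` chain used to obtain a `CdfLoadBuilder`, the table URI, and the `_commit_version` column in the query are assumptions that may vary by delta-rs version; only `DeltaCdfTableProvider::try_new` comes from the diff above.

```rust
use std::sync::Arc;

use datafusion::prelude::SessionContext;
use deltalake::DeltaOps;
// Hypothetical import path; adjust to wherever the crate re-exports the provider.
use deltalake::delta_datafusion::cdf::DeltaCdfTableProvider;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Assumed builder chain: load_cdf() yields the CdfLoadBuilder the provider consumes.
    let cdf_builder = DeltaOps::try_from_uri("./data/my_table")
        .await?
        .load_cdf()
        .with_starting_version(0);

    // Wrap the builder in the new provider and register it like any other DataFusion table.
    let provider = DeltaCdfTableProvider::try_new(cdf_builder)?;
    let ctx = SessionContext::new();
    ctx.register_table("table_changes", Arc::new(provider))?;

    // The WHERE clause, SELECT list, and LIMIT are pushed into the plan built by scan()
    // as FilterExec, ProjectionExec, and GlobalLimitExec respectively.
    ctx.sql("SELECT * FROM table_changes WHERE _commit_version > 0 LIMIT 10")
        .await?
        .show()
        .await?;

    Ok(())
}
```

Since `supports_filters_pushdown` reports `Exact` for every filter, DataFusion will not re-evaluate the pushed-down predicates on top of the returned plan; the `FilterExec` added in `scan` is what actually enforces them.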