-
Couldn't load subscription status.
- Fork 537
feat: add framework for File Format Options #3794
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
0aeb7ea
c081278
9059d8e
a41d3d3
cd88ef8
e5915e0
56ec968
1b7d6c2
4a89682
56a0cb5
73cccbe
5359a9e
2d29203
b9376bd
4ae8443
c7281de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ use crate::delta_datafusion::{ | |
| use crate::errors::{DeltaResult, DeltaTableError}; | ||
| use crate::kernel::{Add, EagerSnapshot}; | ||
| use crate::logstore::LogStoreRef; | ||
| use crate::table::file_format_options::{to_table_parquet_options_from_ffo, FileFormatRef}; | ||
|
|
||
| #[derive(Debug, Hash, Eq, PartialEq)] | ||
| /// Representing the result of the [find_files] function. | ||
|
|
@@ -45,6 +46,7 @@ pub(crate) async fn find_files( | |
| snapshot: &EagerSnapshot, | ||
| log_store: LogStoreRef, | ||
| session: &dyn Session, | ||
| file_format_options: Option<&FileFormatRef>, | ||
| predicate: Option<Expr>, | ||
| ) -> DeltaResult<FindFiles> { | ||
| let current_metadata = snapshot.metadata(); | ||
|
|
@@ -71,8 +73,14 @@ pub(crate) async fn find_files( | |
| Span::current().record("candidate_count", result.candidates.len()); | ||
| Ok(result) | ||
| } else { | ||
| let candidates = | ||
| find_files_scan(snapshot, log_store, session, predicate.to_owned()).await?; | ||
| let candidates = find_files_scan( | ||
| snapshot, | ||
| log_store, | ||
| session, | ||
| file_format_options, | ||
| predicate.to_owned(), | ||
| ) | ||
| .await?; | ||
|
|
||
| let result = FindFiles { | ||
| candidates, | ||
|
|
@@ -221,6 +229,7 @@ async fn find_files_scan( | |
| snapshot: &EagerSnapshot, | ||
| log_store: LogStoreRef, | ||
| session: &dyn Session, | ||
| file_format_options: Option<&FileFormatRef>, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here it's already available in the snapshot. so we can defer at latest stage to grab these from the load_config |
||
| expression: Expr, | ||
| ) -> DeltaResult<Vec<Add>> { | ||
| let candidate_map: HashMap<String, Add> = snapshot | ||
|
|
@@ -250,10 +259,13 @@ async fn find_files_scan( | |
| // Add path column | ||
| used_columns.push(logical_schema.index_of(scan_config.file_column_name.as_ref().unwrap())?); | ||
|
|
||
| let table_parquet_options = to_table_parquet_options_from_ffo(file_format_options); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I prefer we |
||
|
|
||
| let scan = DeltaScanBuilder::new(snapshot, log_store, session) | ||
| .with_filter(Some(expression.clone())) | ||
| .with_projection(Some(&used_columns)) | ||
| .with_scan_config(scan_config) | ||
| .with_parquet_options(table_parquet_options) | ||
| .build() | ||
| .await?; | ||
| let scan = Arc::new(scan); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,6 +59,7 @@ use crate::kernel::{Action, Add, EagerSnapshot, Remove}; | |
| use crate::operations::write::writer::{DeltaWriter, WriterConfig}; | ||
| use crate::operations::write::WriterStatsConfig; | ||
| use crate::protocol::{DeltaOperation, SaveMode}; | ||
| use crate::table::file_format_options::{to_table_parquet_options_from_ffo, FileFormatRef}; | ||
| use crate::{ensure_table_uri, DeltaTable}; | ||
| use crate::{logstore::LogStoreRef, DeltaResult, DeltaTableError}; | ||
|
|
||
|
|
@@ -373,6 +374,7 @@ pub(crate) struct DeltaScanBuilder<'a> { | |
| limit: Option<usize>, | ||
| files: Option<&'a [Add]>, | ||
| config: Option<DeltaScanConfig>, | ||
| parquet_options: Option<TableParquetOptions>, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggest we remove this here. We can move all this logic into: DeltaScanConfigBuilder.build(), there we can introspect the TableLoadConfig and set the parquetOptions on the DeltaScanConfig |
||
| } | ||
|
|
||
| impl<'a> DeltaScanBuilder<'a> { | ||
|
|
@@ -390,9 +392,15 @@ impl<'a> DeltaScanBuilder<'a> { | |
| limit: None, | ||
| files: None, | ||
| config: None, | ||
| parquet_options: None, | ||
| } | ||
| } | ||
|
|
||
| pub fn with_parquet_options(mut self, parquet_options: Option<TableParquetOptions>) -> Self { | ||
| self.parquet_options = parquet_options; | ||
| self | ||
| } | ||
|
|
||
| pub fn with_filter(mut self, filter: Option<Expr>) -> Self { | ||
| self.filter = filter; | ||
| self | ||
|
|
@@ -643,13 +651,30 @@ impl<'a> DeltaScanBuilder<'a> { | |
|
|
||
| let stats = stats.unwrap_or(Statistics::new_unknown(&schema)); | ||
|
|
||
| let parquet_options = TableParquetOptions { | ||
| global: self.session.config().options().execution.parquet.clone(), | ||
| ..Default::default() | ||
| }; | ||
| let parquet_options = self | ||
| .parquet_options | ||
| .unwrap_or_else(|| self.session.table_options().parquet.clone()); | ||
|
|
||
| // We have to set the encryption factory on the ParquetSource based on the Parquet options, | ||
| // as this is usually handled by the ParquetFormat type in DataFusion, | ||
| // which is not used in delta-rs. | ||
| let encryption_factory = parquet_options | ||
| .crypto | ||
| .factory_id | ||
| .as_ref() | ||
| .map(|factory_id| { | ||
| self.session | ||
| .runtime_env() | ||
| .parquet_encryption_factory(factory_id) | ||
| }) | ||
| .transpose()?; | ||
|
|
||
| let mut file_source = ParquetSource::new(parquet_options); | ||
|
|
||
| if let Some(encryption_factory) = encryption_factory { | ||
| file_source = file_source.with_encryption_factory(encryption_factory); | ||
| } | ||
|
|
||
| // Sometimes (i.e Merge) we want to prune files that don't make the | ||
| // filter and read the entire contents for files that do match the | ||
| // filter | ||
|
|
@@ -730,9 +755,17 @@ impl TableProvider for DeltaTable { | |
| limit: Option<usize>, | ||
| ) -> Result<Arc<dyn ExecutionPlan>> { | ||
| register_store(self.log_store(), session.runtime_env().as_ref()); | ||
| if let Some(format_options) = &self.config.file_format_options { | ||
| format_options.update_session(session)?; | ||
| } | ||
| let filter_expr = conjunction(filters.iter().cloned()); | ||
|
|
||
| let scan = DeltaScanBuilder::new(self.snapshot()?.snapshot(), self.log_store(), session) | ||
| .with_parquet_options( | ||
| crate::table::file_format_options::to_table_parquet_options_from_ffo( | ||
| self.config.file_format_options.as_ref(), | ||
| ), | ||
| ) | ||
|
Comment on lines
+758
to
+768
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be removed since this will happen inside the DeltaScanBuilder with my suggestion above |
||
| .with_projection(projection) | ||
| .with_limit(limit) | ||
| .with_filter(filter_expr) | ||
|
|
@@ -763,6 +796,7 @@ pub struct DeltaTableProvider { | |
| config: DeltaScanConfig, | ||
| schema: Arc<Schema>, | ||
| files: Option<Vec<Add>>, | ||
| file_format_options: Option<FileFormatRef>, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here, not needed anymore, since we can pass it through the DeltaScanConfig |
||
| } | ||
|
|
||
| impl DeltaTableProvider { | ||
|
|
@@ -778,9 +812,15 @@ impl DeltaTableProvider { | |
| log_store, | ||
| config, | ||
| files: None, | ||
| file_format_options: None, | ||
| }) | ||
| } | ||
|
|
||
| pub fn with_file_format_options(mut self, file_format_options: Option<FileFormatRef>) -> Self { | ||
| self.file_format_options = file_format_options; | ||
| self | ||
| } | ||
|
|
||
| /// Define which files to consider while building a scan, for advanced usecases | ||
| pub fn with_files(mut self, files: Vec<Add>) -> DeltaTableProvider { | ||
| self.files = Some(files); | ||
|
|
@@ -818,9 +858,17 @@ impl TableProvider for DeltaTableProvider { | |
| limit: Option<usize>, | ||
| ) -> Result<Arc<dyn ExecutionPlan>> { | ||
| register_store(self.log_store.clone(), session.runtime_env().as_ref()); | ||
| if let Some(format_options) = &self.file_format_options { | ||
| format_options.update_session(session)?; | ||
| } | ||
|
|
||
| let filter_expr = conjunction(filters.iter().cloned()); | ||
|
|
||
| let table_parquet_options = | ||
| to_table_parquet_options_from_ffo(self.file_format_options.as_ref()); | ||
|
|
||
| let mut scan = DeltaScanBuilder::new(&self.snapshot, self.log_store.clone(), session) | ||
| .with_parquet_options(table_parquet_options) | ||
| .with_projection(projection) | ||
| .with_limit(limit) | ||
| .with_filter(filter_expr) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not needed anymore, since you can access the snapshot.load_config() to access the
file_format_options