From bf0046374ebc791846b8afa03fc361ced737d4a0 Mon Sep 17 00:00:00 2001 From: Anoop Narang Date: Fri, 20 Mar 2026 12:27:43 +0530 Subject: [PATCH 1/3] feat: support list/array column types in DuckLake type mapping Map list types to Arrow's DataType::List with recursive element type resolution. Supports three syntaxes: list, array, and type[] (Postgres style). Nested complex element types are rejected with a clear error until struct/map support is added. --- src/types.rs | 211 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 153 insertions(+), 58 deletions(-) diff --git a/src/types.rs b/src/types.rs index 0b4bd63..18c4250 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,8 +1,6 @@ //! Type mapping from DuckLake types to Arrow types use std::collections::HashMap; - -#[cfg(test)] use std::sync::Arc; use crate::metadata_provider::DuckLakeTableColumn; @@ -20,6 +18,11 @@ pub fn ducklake_to_arrow_type(ducklake_type: &str) -> Result { return Ok(decimal_params); } + // Handle list/array types + if let Some(list_type) = parse_list_type(&normalized)? { + return Ok(list_type); + } + // Handle basic types match normalized.as_str() { // Boolean @@ -68,13 +71,8 @@ pub fn ducklake_to_arrow_type(ducklake_type: &str) -> Result { "timetz" | "time with time zone" => Ok(DataType::Utf8), _ => { - // Check for complex types (list, struct, map) - if normalized.starts_with("list") || normalized.starts_with("array") { - Err(DuckLakeError::UnsupportedType(format!( - "Complex type '{}' not yet supported. Please open an issue at https://github.com/hotdata-dev/datafusion-ducklake if you need this feature.", - ducklake_type - ))) - } else if normalized.starts_with("struct") { + // Check for complex types (struct, map) + if normalized.starts_with("struct") { Err(DuckLakeError::UnsupportedType(format!( "Struct type '{}' not yet supported. Please open an issue at https://github.com/hotdata-dev/datafusion-ducklake if you need this feature.", ducklake_type @@ -139,12 +137,14 @@ pub fn arrow_to_ducklake_type(arrow_type: &DataType) -> Result { // Null type - map to varchar as there's no direct equivalent DataType::Null => Ok("varchar".to_string()), - // Complex types - not yet supported for writing - DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _) => { - Err(DuckLakeError::UnsupportedType(format!( - "List type '{}' not yet supported for writing", - arrow_type - ))) + // List types + DataType::List(field) | DataType::LargeList(field) => { + let inner = arrow_to_ducklake_type(field.data_type())?; + Ok(format!("list<{}>", inner)) + }, + DataType::FixedSizeList(field, _) => { + let inner = arrow_to_ducklake_type(field.data_type())?; + Ok(format!("list<{}>", inner)) }, DataType::Struct(_) => Err(DuckLakeError::UnsupportedType(format!( "Struct type '{}' not yet supported for writing", @@ -250,6 +250,52 @@ fn parse_decimal(type_str: &str) -> Result> { } } +/// Parse list/array type syntax and return `DataType::List` if matched. +/// +/// Supported formats: +/// - `list` / `array` (DuckDB style) +/// - `element_type[]` (Postgres style, e.g. `varchar[]`, `float[]`) +/// +/// Only simple (non-nested) element types are supported. +fn parse_list_type(type_str: &str) -> Result> { + let inner = if type_str.starts_with("list<") || type_str.starts_with("array<") { + // list or array + let start = type_str.find('<').unwrap(); + if !type_str.ends_with('>') { + return Ok(None); + } + &type_str[start + 1..type_str.len() - 1] + } else if type_str.ends_with("[]") { + // type[] + &type_str[..type_str.len() - 2] + } else { + return Ok(None); + }; + + let inner = inner.trim(); + if inner.is_empty() { + return Err(DuckLakeError::UnsupportedType(format!( + "List type '{}' has empty element type", + type_str + ))); + } + + // Only support simple (non-nested) element types + if inner.contains('<') || inner.contains('[') || inner.contains('{') { + return Err(DuckLakeError::UnsupportedType(format!( + "Nested complex type '{}' not yet supported", + type_str + ))); + } + + let element_type = ducklake_to_arrow_type(inner)?; + Ok(Some(DataType::List(Arc::new(Field::new( + "item", + element_type, + true, + ))))) +} + /// Normalize a DuckLake type string to its canonical form. /// /// Converts aliases and case variants to the canonical DuckLake type string. @@ -567,34 +613,61 @@ mod tests { } #[test] - fn test_unsupported_list_type_errors() { - // Test list type returns error - let result = ducklake_to_arrow_type("list"); - assert!(result.is_err()); - match result { - Err(DuckLakeError::UnsupportedType(msg)) => { - assert!(msg.contains("list")); - assert!(msg.contains("not yet supported")); - assert!(msg.contains("open an issue")); - }, - _ => panic!("Expected UnsupportedType error for list type"), + fn test_list_type_angle_bracket() { + let result = ducklake_to_arrow_type("list").unwrap(); + let expected = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); + assert_eq!(result, expected); + } + + #[test] + fn test_list_type_various_elements() { + let cases = vec![ + ("list", DataType::Utf8), + ("list", DataType::Float64), + ("list", DataType::Boolean), + ("list", DataType::Date32), + ]; + for (type_str, expected_inner) in cases { + let result = ducklake_to_arrow_type(type_str).unwrap(); + let expected = + DataType::List(Arc::new(Field::new("item", expected_inner.clone(), true))); + assert_eq!(result, expected, "Failed for {}", type_str); } } #[test] - fn test_unsupported_array_type_errors() { - // Test array type returns error - let result = ducklake_to_arrow_type("array"); - assert!(result.is_err()); - match result { - Err(DuckLakeError::UnsupportedType(msg)) => { - assert!(msg.contains("array")); - assert!(msg.contains("not yet supported")); - }, - _ => panic!("Expected UnsupportedType error for array type"), + fn test_array_type_angle_bracket() { + let result = ducklake_to_arrow_type("array").unwrap(); + let expected = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + assert_eq!(result, expected); + } + + #[test] + fn test_list_type_postgres_bracket_syntax() { + let cases = vec![ + ("varchar[]", DataType::Utf8), + ("float64[]", DataType::Float64), + ("int32[]", DataType::Int32), + ("boolean[]", DataType::Boolean), + ("bigint[]", DataType::Int64), + ("text[]", DataType::Utf8), + ("float[]", DataType::Float32), + ("integer[]", DataType::Int32), + ]; + for (type_str, expected_inner) in cases { + let result = ducklake_to_arrow_type(type_str).unwrap(); + let expected = + DataType::List(Arc::new(Field::new("item", expected_inner.clone(), true))); + assert_eq!(result, expected, "Failed for {}", type_str); } } + #[test] + fn test_list_type_empty_element_errors() { + assert!(ducklake_to_arrow_type("list<>").is_err()); + assert!(ducklake_to_arrow_type("[]").is_err()); + } + #[test] fn test_unsupported_struct_type_errors() { // Test struct type returns error @@ -627,16 +700,20 @@ mod tests { #[test] fn test_nested_complex_types_error() { - // Test nested complex types return error + // Nested complex types return error let result = ducklake_to_arrow_type("list>"); assert!(result.is_err()); match result { Err(DuckLakeError::UnsupportedType(msg)) => { - assert!(msg.contains("list>")); + assert!(msg.contains("Nested complex type")); assert!(msg.contains("not yet supported")); }, _ => panic!("Expected UnsupportedType error for nested complex type"), } + + // Nested list also errors + assert!(ducklake_to_arrow_type("list>").is_err()); + assert!(ducklake_to_arrow_type("int32[][]").is_err()); } #[test] @@ -738,6 +815,8 @@ mod tests { DataType::Date32, DataType::Timestamp(TimeUnit::Microsecond, None), DataType::Decimal128(10, 2), + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), ]; for original in test_types { @@ -748,17 +827,22 @@ mod tests { } #[test] - fn test_arrow_to_ducklake_unsupported_list() { + fn test_arrow_to_ducklake_list() { let list_type = DataType::List(Arc::new(Field::new("item", DataType::Int32, true))); - let result = arrow_to_ducklake_type(&list_type); - assert!(result.is_err()); - match result { - Err(DuckLakeError::UnsupportedType(msg)) => { - assert!(msg.contains("List type")); - assert!(msg.contains("not yet supported")); - }, - _ => panic!("Expected UnsupportedType error"), - } + assert_eq!(arrow_to_ducklake_type(&list_type).unwrap(), "list"); + + let list_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); + assert_eq!( + arrow_to_ducklake_type(&list_type).unwrap(), + "list" + ); + + let large_list = + DataType::LargeList(Arc::new(Field::new("item", DataType::Float64, true))); + assert_eq!( + arrow_to_ducklake_type(&large_list).unwrap(), + "list" + ); } #[test] @@ -885,8 +969,7 @@ mod tests { } #[test] - fn test_build_schema_with_unsupported_type() { - // Test that build_arrow_schema propagates complex type errors + fn test_build_schema_with_list_type() { let columns = vec![ DuckLakeTableColumn { column_id: 1, @@ -896,20 +979,32 @@ mod tests { }, DuckLakeTableColumn { column_id: 2, - column_name: "data".to_string(), - column_type: "list".to_string(), + column_name: "tags".to_string(), + column_type: "list".to_string(), is_nullable: true, }, ]; + let schema = build_arrow_schema(&columns).unwrap(); + assert_eq!(schema.fields().len(), 2); + assert_eq!( + *schema.field(1).data_type(), + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) + ); + } + + #[test] + fn test_build_schema_with_unsupported_type() { + // Test that build_arrow_schema propagates complex type errors + let columns = vec![DuckLakeTableColumn { + column_id: 1, + column_name: "data".to_string(), + column_type: "struct".to_string(), + is_nullable: true, + }]; + let result = build_arrow_schema(&columns); assert!(result.is_err()); - match result { - Err(DuckLakeError::UnsupportedType(msg)) => { - assert!(msg.contains("list")); - }, - _ => panic!("Expected UnsupportedType error when building schema with complex type"), - } } #[test] From 72c2e9762e17206d103fb166cadf19635f827eab Mon Sep 17 00:00:00 2001 From: Anoop Narang Date: Fri, 20 Mar 2026 12:51:07 +0530 Subject: [PATCH 2/3] fix: reconstruct list types from parent-child column rows in all metadata providers DuckLake stores list columns as parent-child rows in ducklake_column (parent has column_type="list", child has the element type with parent_column pointing to the parent). All four metadata providers now read parent_column, rewrite the parent's type to list, and drop the child row from the result. --- src/metadata_provider.rs | 229 +++++++++++++++++++++++++++++- src/metadata_provider_duckdb.rs | 37 +++-- src/metadata_provider_mysql.rs | 49 ++++--- src/metadata_provider_postgres.rs | 49 ++++--- src/metadata_provider_sqlite.rs | 49 ++++--- 5 files changed, 343 insertions(+), 70 deletions(-) diff --git a/src/metadata_provider.rs b/src/metadata_provider.rs index 49f3c58..f254733 100644 --- a/src/metadata_provider.rs +++ b/src/metadata_provider.rs @@ -17,7 +17,7 @@ pub const SQL_LIST_TABLES: &str = AND ? >= begin_snapshot AND (? < end_snapshot OR end_snapshot IS NULL)"; -pub const SQL_GET_TABLE_COLUMNS: &str = "SELECT column_id, column_name, column_type, nulls_allowed +pub const SQL_GET_TABLE_COLUMNS: &str = "SELECT column_id, column_name, column_type, nulls_allowed, parent_column FROM ducklake_column WHERE table_id = ? AND end_snapshot IS NULL ORDER BY column_order"; @@ -218,7 +218,8 @@ pub const SQL_LIST_ALL_COLUMNS: &str = " c.column_id, c.column_name, c.column_type, - c.nulls_allowed + c.nulls_allowed, + c.parent_column FROM ducklake_schema s JOIN ducklake_table t ON s.schema_id = t.schema_id JOIN ducklake_column c ON t.table_id = c.table_id @@ -356,6 +357,99 @@ impl DuckLakeTableColumn { } } +/// Reconstruct list types from parent-child column rows. +/// +/// DuckLake stores list columns as two rows in `ducklake_column`: +/// - Parent row: `column_type = "list"`, `parent_column = NULL` +/// - Child row: `column_type = ""`, `parent_column = ` +/// +/// This function rewrites the parent's `column_type` to `list` +/// and removes child rows from the result. +/// +/// Only handles `list` parent types. Struct, map, etc. are left unchanged. +pub fn reconstruct_list_columns( + rows: Vec<(DuckLakeTableColumn, Option)>, +) -> Vec { + use std::collections::HashMap; + + // Index: column_id -> position in rows + let id_to_index: HashMap = rows + .iter() + .enumerate() + .map(|(i, (col, _))| (col.column_id, i)) + .collect(); + + // Separate into columns and parent_column arrays + let mut columns: Vec = Vec::with_capacity(rows.len()); + let mut parent_columns: Vec> = Vec::with_capacity(rows.len()); + for (col, parent) in rows { + columns.push(col); + parent_columns.push(parent); + } + + // Find children of list parents and rewrite parent types + let mut skip: std::collections::HashSet = std::collections::HashSet::new(); + for (i, parent_id) in parent_columns.iter().enumerate() { + if let Some(pid) = parent_id { + if let Some(&parent_idx) = id_to_index.get(pid) { + if columns[parent_idx].column_type == "list" { + columns[parent_idx].column_type = + format!("list<{}>", columns[i].column_type); + skip.insert(i); + } + } + } + } + + // Return only top-level columns (not children) + columns + .into_iter() + .enumerate() + .filter(|(i, _)| !skip.contains(i)) + .map(|(_, col)| col) + .collect() +} + +/// Same as [`reconstruct_list_columns`] but for [`ColumnWithTable`] rows. +pub fn reconstruct_list_columns_with_table( + rows: Vec<(ColumnWithTable, Option)>, +) -> Vec { + use std::collections::HashMap; + + let id_to_index: HashMap = rows + .iter() + .enumerate() + .map(|(i, (cwt, _))| (cwt.column.column_id, i)) + .collect(); + + let mut entries: Vec = Vec::with_capacity(rows.len()); + let mut parent_columns: Vec> = Vec::with_capacity(rows.len()); + for (cwt, parent) in rows { + entries.push(cwt); + parent_columns.push(parent); + } + + let mut skip: std::collections::HashSet = std::collections::HashSet::new(); + for (i, parent_id) in parent_columns.iter().enumerate() { + if let Some(pid) = parent_id { + if let Some(&parent_idx) = id_to_index.get(pid) { + if entries[parent_idx].column.column_type == "list" { + entries[parent_idx].column.column_type = + format!("list<{}>", entries[i].column.column_type); + skip.insert(i); + } + } + } + } + + entries + .into_iter() + .enumerate() + .filter(|(i, _)| !skip.contains(i)) + .map(|(_, e)| e) + .collect() +} + /// Metadata for a data file or delete file in DuckLake #[derive(Debug, Clone)] pub struct DuckLakeFileData { @@ -534,3 +628,134 @@ where { tokio::task::block_in_place(|| tokio::runtime::Handle::current().block_on(f)) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_reconstruct_list_columns_basic() { + let rows = vec![ + ( + DuckLakeTableColumn::new(1, "id".into(), "int64".into(), false), + None, + ), + ( + DuckLakeTableColumn::new(6, "vector".into(), "list".into(), true), + None, + ), + ( + DuckLakeTableColumn::new(7, "element".into(), "float64".into(), true), + Some(6), + ), + ]; + + let result = reconstruct_list_columns(rows); + assert_eq!(result.len(), 2); + assert_eq!(result[0].column_name, "id"); + assert_eq!(result[0].column_type, "int64"); + assert_eq!(result[1].column_name, "vector"); + assert_eq!(result[1].column_type, "list"); + } + + #[test] + fn test_reconstruct_list_columns_no_lists() { + let rows = vec![ + ( + DuckLakeTableColumn::new(1, "id".into(), "int64".into(), false), + None, + ), + ( + DuckLakeTableColumn::new(2, "name".into(), "varchar".into(), true), + None, + ), + ]; + + let result = reconstruct_list_columns(rows); + assert_eq!(result.len(), 2); + assert_eq!(result[0].column_type, "int64"); + assert_eq!(result[1].column_type, "varchar"); + } + + #[test] + fn test_reconstruct_list_columns_struct_parent_unchanged() { + // Struct parents should NOT be rewritten — child stays in result + let rows = vec![ + ( + DuckLakeTableColumn::new(1, "data".into(), "struct".into(), true), + None, + ), + ( + DuckLakeTableColumn::new(2, "field_a".into(), "int32".into(), true), + Some(1), + ), + ]; + + let result = reconstruct_list_columns(rows); + assert_eq!(result.len(), 2); // both remain + assert_eq!(result[0].column_type, "struct"); // unchanged + } + + #[test] + fn test_reconstruct_list_columns_multiple_lists() { + let rows = vec![ + ( + DuckLakeTableColumn::new(1, "tags".into(), "list".into(), true), + None, + ), + ( + DuckLakeTableColumn::new(2, "element".into(), "varchar".into(), true), + Some(1), + ), + ( + DuckLakeTableColumn::new(3, "scores".into(), "list".into(), true), + None, + ), + ( + DuckLakeTableColumn::new(4, "element".into(), "float64".into(), true), + Some(3), + ), + ]; + + let result = reconstruct_list_columns(rows); + assert_eq!(result.len(), 2); + assert_eq!(result[0].column_type, "list"); + assert_eq!(result[1].column_type, "list"); + } + + #[test] + fn test_reconstruct_list_columns_with_table_basic() { + let rows = vec![ + ( + ColumnWithTable { + schema_name: "main".into(), + table_name: "t".into(), + column: DuckLakeTableColumn::new( + 6, + "vector".into(), + "list".into(), + true, + ), + }, + None, + ), + ( + ColumnWithTable { + schema_name: "main".into(), + table_name: "t".into(), + column: DuckLakeTableColumn::new( + 7, + "element".into(), + "float64".into(), + true, + ), + }, + Some(6), + ), + ]; + + let result = reconstruct_list_columns_with_table(rows); + assert_eq!(result.len(), 1); + assert_eq!(result[0].column.column_type, "list"); + } +} diff --git a/src/metadata_provider_duckdb.rs b/src/metadata_provider_duckdb.rs index 5d39e49..b2a8c40 100644 --- a/src/metadata_provider_duckdb.rs +++ b/src/metadata_provider_duckdb.rs @@ -7,6 +7,7 @@ use crate::metadata_provider::{ SQL_GET_TABLE_BY_NAME, SQL_GET_TABLE_COLUMNS, SQL_LIST_ALL_COLUMNS, SQL_LIST_ALL_FILES, SQL_LIST_ALL_TABLES, SQL_LIST_SCHEMAS, SQL_LIST_SNAPSHOTS, SQL_LIST_TABLES, SQL_TABLE_EXISTS, SchemaMetadata, SnapshotMetadata, TableMetadata, TableWithSchema, + reconstruct_list_columns, reconstruct_list_columns_with_table, }; use duckdb::AccessMode::ReadOnly; use duckdb::{Config, Connection, params}; @@ -145,22 +146,26 @@ impl MetadataProvider for DuckdbMetadataProvider { let conn = self.connection(); let mut stmt = conn.prepare(SQL_GET_TABLE_COLUMNS)?; - let columns = stmt + let raw_columns: Vec<(DuckLakeTableColumn, Option)> = stmt .query_map([table_id], |row| { let column_id: i64 = row.get(0)?; let column_name: String = row.get(1)?; let column_type: String = row.get(2)?; let nulls_allowed: Option = row.get(3)?; - Ok(DuckLakeTableColumn::new( - column_id, - column_name, - column_type, - nulls_allowed.unwrap_or(true), + let parent_column: Option = row.get(4)?; + Ok(( + DuckLakeTableColumn::new( + column_id, + column_name, + column_type, + nulls_allowed.unwrap_or(true), + ), + parent_column, )) })? .collect::, _>>()?; - Ok(columns) + Ok(reconstruct_list_columns(raw_columns)) } fn get_table_files_for_select( @@ -307,29 +312,33 @@ impl MetadataProvider for DuckdbMetadataProvider { let conn = self.connection(); let mut stmt = conn.prepare(SQL_LIST_ALL_COLUMNS)?; - let columns = stmt + let raw_columns: Vec<(ColumnWithTable, Option)> = stmt .query_map( params![snapshot_id, snapshot_id, snapshot_id, snapshot_id], |row| { let schema_name: String = row.get(0)?; let table_name: String = row.get(1)?; let nulls_allowed: Option = row.get(5)?; + let parent_column: Option = row.get(6)?; let column = DuckLakeTableColumn { column_id: row.get(2)?, column_name: row.get(3)?, column_type: row.get(4)?, is_nullable: nulls_allowed.unwrap_or(true), }; - Ok(ColumnWithTable { - schema_name, - table_name, - column, - }) + Ok(( + ColumnWithTable { + schema_name, + table_name, + column, + }, + parent_column, + )) }, )? .collect::, _>>()?; - Ok(columns) + Ok(reconstruct_list_columns_with_table(raw_columns)) } fn list_all_files(&self, snapshot_id: i64) -> crate::Result> { diff --git a/src/metadata_provider_mysql.rs b/src/metadata_provider_mysql.rs index 08ea16d..d53e0db 100644 --- a/src/metadata_provider_mysql.rs +++ b/src/metadata_provider_mysql.rs @@ -4,7 +4,8 @@ use crate::Result; use crate::metadata_provider::{ ColumnWithTable, DataFileChange, DeleteFileChange, DuckLakeFileData, DuckLakeTableColumn, DuckLakeTableFile, FileWithTable, MetadataProvider, SchemaMetadata, SnapshotMetadata, - TableMetadata, TableWithSchema, block_on, + TableMetadata, TableWithSchema, block_on, reconstruct_list_columns, + reconstruct_list_columns_with_table, }; use sqlx::Row; use sqlx::mysql::{MySqlPool, MySqlPoolOptions}; @@ -139,7 +140,7 @@ impl MetadataProvider for MySqlMetadataProvider { fn get_table_structure(&self, table_id: i64) -> Result> { block_on(async { let rows = sqlx::query( - "SELECT column_id, column_name, column_type, nulls_allowed + "SELECT column_id, column_name, column_type, nulls_allowed, parent_column FROM ducklake_column WHERE table_id = ? AND end_snapshot IS NULL ORDER BY column_order", @@ -148,17 +149,23 @@ impl MetadataProvider for MySqlMetadataProvider { .fetch_all(&self.pool) .await?; - rows.into_iter() + let raw: Result)>> = rows + .into_iter() .map(|row| { let nulls_allowed: Option = row.try_get(3)?; - Ok(DuckLakeTableColumn { - column_id: row.try_get(0)?, - column_name: row.try_get(1)?, - column_type: row.try_get(2)?, - is_nullable: nulls_allowed.unwrap_or(true), - }) + let parent_column: Option = row.try_get(4)?; + Ok(( + DuckLakeTableColumn { + column_id: row.try_get(0)?, + column_name: row.try_get(1)?, + column_type: row.try_get(2)?, + is_nullable: nulls_allowed.unwrap_or(true), + }, + parent_column, + )) }) - .collect() + .collect(); + Ok(reconstruct_list_columns(raw?)) }) } @@ -358,7 +365,7 @@ impl MetadataProvider for MySqlMetadataProvider { fn list_all_columns(&self, snapshot_id: i64) -> Result> { block_on(async { let rows = sqlx::query( - "SELECT s.schema_name, t.table_name, c.column_id, c.column_name, c.column_type, c.nulls_allowed + "SELECT s.schema_name, t.table_name, c.column_id, c.column_name, c.column_type, c.nulls_allowed, c.parent_column FROM ducklake_schema s JOIN ducklake_table t ON s.schema_id = t.schema_id JOIN ducklake_column c ON t.table_id = c.table_id @@ -375,24 +382,30 @@ impl MetadataProvider for MySqlMetadataProvider { .fetch_all(&self.pool) .await?; - rows.into_iter() + let raw: Result)>> = rows + .into_iter() .map(|row| { let schema_name: String = row.try_get(0)?; let table_name: String = row.try_get(1)?; let nulls_allowed: Option = row.try_get(5)?; + let parent_column: Option = row.try_get(6)?; let column = DuckLakeTableColumn { column_id: row.try_get(2)?, column_name: row.try_get(3)?, column_type: row.try_get(4)?, is_nullable: nulls_allowed.unwrap_or(true), }; - Ok(ColumnWithTable { - schema_name, - table_name, - column, - }) + Ok(( + ColumnWithTable { + schema_name, + table_name, + column, + }, + parent_column, + )) }) - .collect() + .collect(); + Ok(reconstruct_list_columns_with_table(raw?)) }) } diff --git a/src/metadata_provider_postgres.rs b/src/metadata_provider_postgres.rs index 568f276..d349d75 100644 --- a/src/metadata_provider_postgres.rs +++ b/src/metadata_provider_postgres.rs @@ -4,7 +4,8 @@ use crate::Result; use crate::metadata_provider::{ ColumnWithTable, DataFileChange, DeleteFileChange, DuckLakeFileData, DuckLakeTableColumn, DuckLakeTableFile, FileWithTable, MetadataProvider, SchemaMetadata, SnapshotMetadata, - TableMetadata, TableWithSchema, block_on, + TableMetadata, TableWithSchema, block_on, reconstruct_list_columns, + reconstruct_list_columns_with_table, }; use sqlx::Row; use sqlx::postgres::{PgPool, PgPoolOptions}; @@ -173,7 +174,7 @@ impl MetadataProvider for PostgresMetadataProvider { fn get_table_structure(&self, table_id: i64) -> Result> { block_on(async { let rows = sqlx::query( - "SELECT column_id, column_name, column_type, nulls_allowed + "SELECT column_id, column_name, column_type, nulls_allowed, parent_column FROM ducklake_column WHERE table_id = $1 AND end_snapshot IS NULL ORDER BY column_order", @@ -182,17 +183,23 @@ impl MetadataProvider for PostgresMetadataProvider { .fetch_all(&self.pool) .await?; - rows.into_iter() + let raw: Result)>> = rows + .into_iter() .map(|row| { let nulls_allowed: Option = row.try_get(3)?; - Ok(DuckLakeTableColumn { - column_id: row.try_get(0)?, - column_name: row.try_get(1)?, - column_type: row.try_get(2)?, - is_nullable: nulls_allowed.unwrap_or(true), - }) + let parent_column: Option = row.try_get(4)?; + Ok(( + DuckLakeTableColumn { + column_id: row.try_get(0)?, + column_name: row.try_get(1)?, + column_type: row.try_get(2)?, + is_nullable: nulls_allowed.unwrap_or(true), + }, + parent_column, + )) }) - .collect() + .collect(); + Ok(reconstruct_list_columns(raw?)) }) } @@ -391,7 +398,7 @@ impl MetadataProvider for PostgresMetadataProvider { fn list_all_columns(&self, snapshot_id: i64) -> Result> { block_on(async { let rows = sqlx::query( - "SELECT s.schema_name, t.table_name, c.column_id, c.column_name, c.column_type, c.nulls_allowed + "SELECT s.schema_name, t.table_name, c.column_id, c.column_name, c.column_type, c.nulls_allowed, c.parent_column FROM ducklake_schema s JOIN ducklake_table t ON s.schema_id = t.schema_id JOIN ducklake_column c ON t.table_id = c.table_id @@ -408,24 +415,30 @@ impl MetadataProvider for PostgresMetadataProvider { .fetch_all(&self.pool) .await?; - rows.into_iter() + let raw: Result)>> = rows + .into_iter() .map(|row| { let schema_name: String = row.try_get(0)?; let table_name: String = row.try_get(1)?; let nulls_allowed: Option = row.try_get(5)?; + let parent_column: Option = row.try_get(6)?; let column = DuckLakeTableColumn { column_id: row.try_get(2)?, column_name: row.try_get(3)?, column_type: row.try_get(4)?, is_nullable: nulls_allowed.unwrap_or(true), }; - Ok(ColumnWithTable { - schema_name, - table_name, - column, - }) + Ok(( + ColumnWithTable { + schema_name, + table_name, + column, + }, + parent_column, + )) }) - .collect() + .collect(); + Ok(reconstruct_list_columns_with_table(raw?)) }) } diff --git a/src/metadata_provider_sqlite.rs b/src/metadata_provider_sqlite.rs index 409b994..3254c49 100644 --- a/src/metadata_provider_sqlite.rs +++ b/src/metadata_provider_sqlite.rs @@ -4,7 +4,8 @@ use crate::Result; use crate::metadata_provider::{ ColumnWithTable, DataFileChange, DeleteFileChange, DuckLakeFileData, DuckLakeTableColumn, DuckLakeTableFile, FileWithTable, MetadataProvider, SchemaMetadata, SnapshotMetadata, - TableMetadata, TableWithSchema, block_on, + TableMetadata, TableWithSchema, block_on, reconstruct_list_columns, + reconstruct_list_columns_with_table, }; use sqlx::Row; use sqlx::sqlite::{SqlitePool, SqlitePoolOptions}; @@ -140,7 +141,7 @@ impl MetadataProvider for SqliteMetadataProvider { fn get_table_structure(&self, table_id: i64) -> Result> { block_on(async { let rows = sqlx::query( - "SELECT column_id, column_name, column_type, nulls_allowed + "SELECT column_id, column_name, column_type, nulls_allowed, parent_column FROM ducklake_column WHERE table_id = ? AND end_snapshot IS NULL ORDER BY column_order", @@ -149,17 +150,23 @@ impl MetadataProvider for SqliteMetadataProvider { .fetch_all(&self.pool) .await?; - rows.into_iter() + let raw: Result)>> = rows + .into_iter() .map(|row| { let nulls_allowed: Option = row.try_get(3)?; - Ok(DuckLakeTableColumn { - column_id: row.try_get(0)?, - column_name: row.try_get(1)?, - column_type: row.try_get(2)?, - is_nullable: nulls_allowed.unwrap_or(true), - }) + let parent_column: Option = row.try_get(4)?; + Ok(( + DuckLakeTableColumn { + column_id: row.try_get(0)?, + column_name: row.try_get(1)?, + column_type: row.try_get(2)?, + is_nullable: nulls_allowed.unwrap_or(true), + }, + parent_column, + )) }) - .collect() + .collect(); + Ok(reconstruct_list_columns(raw?)) }) } @@ -357,7 +364,7 @@ impl MetadataProvider for SqliteMetadataProvider { fn list_all_columns(&self, snapshot_id: i64) -> Result> { block_on(async { let rows = sqlx::query( - "SELECT s.schema_name, t.table_name, c.column_id, c.column_name, c.column_type, c.nulls_allowed + "SELECT s.schema_name, t.table_name, c.column_id, c.column_name, c.column_type, c.nulls_allowed, c.parent_column FROM ducklake_schema s JOIN ducklake_table t ON s.schema_id = t.schema_id JOIN ducklake_column c ON t.table_id = c.table_id @@ -374,24 +381,30 @@ impl MetadataProvider for SqliteMetadataProvider { .fetch_all(&self.pool) .await?; - rows.into_iter() + let raw: Result)>> = rows + .into_iter() .map(|row| { let schema_name: String = row.try_get(0)?; let table_name: String = row.try_get(1)?; let nulls_allowed: Option = row.try_get(5)?; + let parent_column: Option = row.try_get(6)?; let column = DuckLakeTableColumn { column_id: row.try_get(2)?, column_name: row.try_get(3)?, column_type: row.try_get(4)?, is_nullable: nulls_allowed.unwrap_or(true), }; - Ok(ColumnWithTable { - schema_name, - table_name, - column, - }) + Ok(( + ColumnWithTable { + schema_name, + table_name, + column, + }, + parent_column, + )) }) - .collect() + .collect(); + Ok(reconstruct_list_columns_with_table(raw?)) }) } From ac6d231d73fad2ea62d86c9ba2744316592a2c42 Mon Sep 17 00:00:00 2001 From: Anoop Narang Date: Fri, 20 Mar 2026 12:56:31 +0530 Subject: [PATCH 3/3] style: fix clippy and fmt warnings --- src/metadata_provider.rs | 46 ++++++++++++--------------------- src/metadata_provider_duckdb.rs | 4 +-- src/types.rs | 12 +++------ 3 files changed, 23 insertions(+), 39 deletions(-) diff --git a/src/metadata_provider.rs b/src/metadata_provider.rs index f254733..808f319 100644 --- a/src/metadata_provider.rs +++ b/src/metadata_provider.rs @@ -17,7 +17,8 @@ pub const SQL_LIST_TABLES: &str = AND ? >= begin_snapshot AND (? < end_snapshot OR end_snapshot IS NULL)"; -pub const SQL_GET_TABLE_COLUMNS: &str = "SELECT column_id, column_name, column_type, nulls_allowed, parent_column +pub const SQL_GET_TABLE_COLUMNS: &str = + "SELECT column_id, column_name, column_type, nulls_allowed, parent_column FROM ducklake_column WHERE table_id = ? AND end_snapshot IS NULL ORDER BY column_order"; @@ -390,14 +391,12 @@ pub fn reconstruct_list_columns( // Find children of list parents and rewrite parent types let mut skip: std::collections::HashSet = std::collections::HashSet::new(); for (i, parent_id) in parent_columns.iter().enumerate() { - if let Some(pid) = parent_id { - if let Some(&parent_idx) = id_to_index.get(pid) { - if columns[parent_idx].column_type == "list" { - columns[parent_idx].column_type = - format!("list<{}>", columns[i].column_type); - skip.insert(i); - } - } + if let Some(pid) = parent_id + && let Some(&parent_idx) = id_to_index.get(pid) + && columns[parent_idx].column_type == "list" + { + columns[parent_idx].column_type = format!("list<{}>", columns[i].column_type); + skip.insert(i); } } @@ -431,14 +430,13 @@ pub fn reconstruct_list_columns_with_table( let mut skip: std::collections::HashSet = std::collections::HashSet::new(); for (i, parent_id) in parent_columns.iter().enumerate() { - if let Some(pid) = parent_id { - if let Some(&parent_idx) = id_to_index.get(pid) { - if entries[parent_idx].column.column_type == "list" { - entries[parent_idx].column.column_type = - format!("list<{}>", entries[i].column.column_type); - skip.insert(i); - } - } + if let Some(pid) = parent_id + && let Some(&parent_idx) = id_to_index.get(pid) + && entries[parent_idx].column.column_type == "list" + { + entries[parent_idx].column.column_type = + format!("list<{}>", entries[i].column.column_type); + skip.insert(i); } } @@ -730,12 +728,7 @@ mod tests { ColumnWithTable { schema_name: "main".into(), table_name: "t".into(), - column: DuckLakeTableColumn::new( - 6, - "vector".into(), - "list".into(), - true, - ), + column: DuckLakeTableColumn::new(6, "vector".into(), "list".into(), true), }, None, ), @@ -743,12 +736,7 @@ mod tests { ColumnWithTable { schema_name: "main".into(), table_name: "t".into(), - column: DuckLakeTableColumn::new( - 7, - "element".into(), - "float64".into(), - true, - ), + column: DuckLakeTableColumn::new(7, "element".into(), "float64".into(), true), }, Some(6), ), diff --git a/src/metadata_provider_duckdb.rs b/src/metadata_provider_duckdb.rs index b2a8c40..0cf6b1e 100644 --- a/src/metadata_provider_duckdb.rs +++ b/src/metadata_provider_duckdb.rs @@ -6,8 +6,8 @@ use crate::metadata_provider::{ SQL_GET_DELETE_FILES_ADDED_BETWEEN_SNAPSHOTS, SQL_GET_LATEST_SNAPSHOT, SQL_GET_SCHEMA_BY_NAME, SQL_GET_TABLE_BY_NAME, SQL_GET_TABLE_COLUMNS, SQL_LIST_ALL_COLUMNS, SQL_LIST_ALL_FILES, SQL_LIST_ALL_TABLES, SQL_LIST_SCHEMAS, SQL_LIST_SNAPSHOTS, SQL_LIST_TABLES, SQL_TABLE_EXISTS, - SchemaMetadata, SnapshotMetadata, TableMetadata, TableWithSchema, - reconstruct_list_columns, reconstruct_list_columns_with_table, + SchemaMetadata, SnapshotMetadata, TableMetadata, TableWithSchema, reconstruct_list_columns, + reconstruct_list_columns_with_table, }; use duckdb::AccessMode::ReadOnly; use duckdb::{Config, Connection, params}; diff --git a/src/types.rs b/src/types.rs index 18c4250..384a300 100644 --- a/src/types.rs +++ b/src/types.rs @@ -265,9 +265,9 @@ fn parse_list_type(type_str: &str) -> Result> { return Ok(None); } &type_str[start + 1..type_str.len() - 1] - } else if type_str.ends_with("[]") { + } else if let Some(stripped) = type_str.strip_suffix("[]") { // type[] - &type_str[..type_str.len() - 2] + stripped } else { return Ok(None); }; @@ -832,13 +832,9 @@ mod tests { assert_eq!(arrow_to_ducklake_type(&list_type).unwrap(), "list"); let list_type = DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))); - assert_eq!( - arrow_to_ducklake_type(&list_type).unwrap(), - "list" - ); + assert_eq!(arrow_to_ducklake_type(&list_type).unwrap(), "list"); - let large_list = - DataType::LargeList(Arc::new(Field::new("item", DataType::Float64, true))); + let large_list = DataType::LargeList(Arc::new(Field::new("item", DataType::Float64, true))); assert_eq!( arrow_to_ducklake_type(&large_list).unwrap(), "list"