Skip to content

Commit c460b25

Browse files
authored
feat: support list/array column types in DuckLake type mapping (#89)
* feat: support list/array column types in DuckLake type mapping Map list types to Arrow's DataType::List with recursive element type resolution. Supports three syntaxes: list<type>, array<type>, and type[] (Postgres style). Nested complex element types are rejected with a clear error until struct/map support is added. * fix: reconstruct list types from parent-child column rows in all metadata providers DuckLake stores list columns as parent-child rows in ducklake_column (parent has column_type="list", child has the element type with parent_column pointing to the parent). All four metadata providers now read parent_column, rewrite the parent's type to list<element_type>, and drop the child row from the result. * style: fix clippy and fmt warnings
1 parent c46f5c6 commit c460b25

6 files changed

Lines changed: 481 additions & 129 deletions

src/metadata_provider.rs

Lines changed: 215 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ pub const SQL_LIST_TABLES: &str =
1717
AND ? >= begin_snapshot
1818
AND (? < end_snapshot OR end_snapshot IS NULL)";
1919

20-
pub const SQL_GET_TABLE_COLUMNS: &str = "SELECT column_id, column_name, column_type, nulls_allowed
20+
pub const SQL_GET_TABLE_COLUMNS: &str =
21+
"SELECT column_id, column_name, column_type, nulls_allowed, parent_column
2122
FROM ducklake_column
2223
WHERE table_id = ? AND end_snapshot IS NULL
2324
ORDER BY column_order";
@@ -218,7 +219,8 @@ pub const SQL_LIST_ALL_COLUMNS: &str = "
218219
c.column_id,
219220
c.column_name,
220221
c.column_type,
221-
c.nulls_allowed
222+
c.nulls_allowed,
223+
c.parent_column
222224
FROM ducklake_schema s
223225
JOIN ducklake_table t ON s.schema_id = t.schema_id
224226
JOIN ducklake_column c ON t.table_id = c.table_id
@@ -356,6 +358,96 @@ impl DuckLakeTableColumn {
356358
}
357359
}
358360

361+
/// Reconstruct list types from parent-child column rows.
362+
///
363+
/// DuckLake stores list columns as two rows in `ducklake_column`:
364+
/// - Parent row: `column_type = "list"`, `parent_column = NULL`
365+
/// - Child row: `column_type = "<element_type>"`, `parent_column = <parent_column_id>`
366+
///
367+
/// This function rewrites the parent's `column_type` to `list<element_type>`
368+
/// and removes child rows from the result.
369+
///
370+
/// Only handles `list` parent types. Struct, map, etc. are left unchanged.
371+
pub fn reconstruct_list_columns(
372+
rows: Vec<(DuckLakeTableColumn, Option<i64>)>,
373+
) -> Vec<DuckLakeTableColumn> {
374+
use std::collections::HashMap;
375+
376+
// Index: column_id -> position in rows
377+
let id_to_index: HashMap<i64, usize> = rows
378+
.iter()
379+
.enumerate()
380+
.map(|(i, (col, _))| (col.column_id, i))
381+
.collect();
382+
383+
// Separate into columns and parent_column arrays
384+
let mut columns: Vec<DuckLakeTableColumn> = Vec::with_capacity(rows.len());
385+
let mut parent_columns: Vec<Option<i64>> = Vec::with_capacity(rows.len());
386+
for (col, parent) in rows {
387+
columns.push(col);
388+
parent_columns.push(parent);
389+
}
390+
391+
// Find children of list parents and rewrite parent types
392+
let mut skip: std::collections::HashSet<usize> = std::collections::HashSet::new();
393+
for (i, parent_id) in parent_columns.iter().enumerate() {
394+
if let Some(pid) = parent_id
395+
&& let Some(&parent_idx) = id_to_index.get(pid)
396+
&& columns[parent_idx].column_type == "list"
397+
{
398+
columns[parent_idx].column_type = format!("list<{}>", columns[i].column_type);
399+
skip.insert(i);
400+
}
401+
}
402+
403+
// Return only top-level columns (not children)
404+
columns
405+
.into_iter()
406+
.enumerate()
407+
.filter(|(i, _)| !skip.contains(i))
408+
.map(|(_, col)| col)
409+
.collect()
410+
}
411+
412+
/// Same as [`reconstruct_list_columns`] but for [`ColumnWithTable`] rows.
413+
pub fn reconstruct_list_columns_with_table(
414+
rows: Vec<(ColumnWithTable, Option<i64>)>,
415+
) -> Vec<ColumnWithTable> {
416+
use std::collections::HashMap;
417+
418+
let id_to_index: HashMap<i64, usize> = rows
419+
.iter()
420+
.enumerate()
421+
.map(|(i, (cwt, _))| (cwt.column.column_id, i))
422+
.collect();
423+
424+
let mut entries: Vec<ColumnWithTable> = Vec::with_capacity(rows.len());
425+
let mut parent_columns: Vec<Option<i64>> = Vec::with_capacity(rows.len());
426+
for (cwt, parent) in rows {
427+
entries.push(cwt);
428+
parent_columns.push(parent);
429+
}
430+
431+
let mut skip: std::collections::HashSet<usize> = std::collections::HashSet::new();
432+
for (i, parent_id) in parent_columns.iter().enumerate() {
433+
if let Some(pid) = parent_id
434+
&& let Some(&parent_idx) = id_to_index.get(pid)
435+
&& entries[parent_idx].column.column_type == "list"
436+
{
437+
entries[parent_idx].column.column_type =
438+
format!("list<{}>", entries[i].column.column_type);
439+
skip.insert(i);
440+
}
441+
}
442+
443+
entries
444+
.into_iter()
445+
.enumerate()
446+
.filter(|(i, _)| !skip.contains(i))
447+
.map(|(_, e)| e)
448+
.collect()
449+
}
450+
359451
/// Metadata for a data file or delete file in DuckLake
360452
#[derive(Debug, Clone)]
361453
pub struct DuckLakeFileData {
@@ -534,3 +626,124 @@ where
534626
{
535627
tokio::task::block_in_place(|| tokio::runtime::Handle::current().block_on(f))
536628
}
629+
630+
#[cfg(test)]
631+
mod tests {
632+
use super::*;
633+
634+
#[test]
635+
fn test_reconstruct_list_columns_basic() {
636+
let rows = vec![
637+
(
638+
DuckLakeTableColumn::new(1, "id".into(), "int64".into(), false),
639+
None,
640+
),
641+
(
642+
DuckLakeTableColumn::new(6, "vector".into(), "list".into(), true),
643+
None,
644+
),
645+
(
646+
DuckLakeTableColumn::new(7, "element".into(), "float64".into(), true),
647+
Some(6),
648+
),
649+
];
650+
651+
let result = reconstruct_list_columns(rows);
652+
assert_eq!(result.len(), 2);
653+
assert_eq!(result[0].column_name, "id");
654+
assert_eq!(result[0].column_type, "int64");
655+
assert_eq!(result[1].column_name, "vector");
656+
assert_eq!(result[1].column_type, "list<float64>");
657+
}
658+
659+
#[test]
660+
fn test_reconstruct_list_columns_no_lists() {
661+
let rows = vec![
662+
(
663+
DuckLakeTableColumn::new(1, "id".into(), "int64".into(), false),
664+
None,
665+
),
666+
(
667+
DuckLakeTableColumn::new(2, "name".into(), "varchar".into(), true),
668+
None,
669+
),
670+
];
671+
672+
let result = reconstruct_list_columns(rows);
673+
assert_eq!(result.len(), 2);
674+
assert_eq!(result[0].column_type, "int64");
675+
assert_eq!(result[1].column_type, "varchar");
676+
}
677+
678+
#[test]
679+
fn test_reconstruct_list_columns_struct_parent_unchanged() {
680+
// Struct parents should NOT be rewritten — child stays in result
681+
let rows = vec![
682+
(
683+
DuckLakeTableColumn::new(1, "data".into(), "struct".into(), true),
684+
None,
685+
),
686+
(
687+
DuckLakeTableColumn::new(2, "field_a".into(), "int32".into(), true),
688+
Some(1),
689+
),
690+
];
691+
692+
let result = reconstruct_list_columns(rows);
693+
assert_eq!(result.len(), 2); // both remain
694+
assert_eq!(result[0].column_type, "struct"); // unchanged
695+
}
696+
697+
#[test]
698+
fn test_reconstruct_list_columns_multiple_lists() {
699+
let rows = vec![
700+
(
701+
DuckLakeTableColumn::new(1, "tags".into(), "list".into(), true),
702+
None,
703+
),
704+
(
705+
DuckLakeTableColumn::new(2, "element".into(), "varchar".into(), true),
706+
Some(1),
707+
),
708+
(
709+
DuckLakeTableColumn::new(3, "scores".into(), "list".into(), true),
710+
None,
711+
),
712+
(
713+
DuckLakeTableColumn::new(4, "element".into(), "float64".into(), true),
714+
Some(3),
715+
),
716+
];
717+
718+
let result = reconstruct_list_columns(rows);
719+
assert_eq!(result.len(), 2);
720+
assert_eq!(result[0].column_type, "list<varchar>");
721+
assert_eq!(result[1].column_type, "list<float64>");
722+
}
723+
724+
#[test]
725+
fn test_reconstruct_list_columns_with_table_basic() {
726+
let rows = vec![
727+
(
728+
ColumnWithTable {
729+
schema_name: "main".into(),
730+
table_name: "t".into(),
731+
column: DuckLakeTableColumn::new(6, "vector".into(), "list".into(), true),
732+
},
733+
None,
734+
),
735+
(
736+
ColumnWithTable {
737+
schema_name: "main".into(),
738+
table_name: "t".into(),
739+
column: DuckLakeTableColumn::new(7, "element".into(), "float64".into(), true),
740+
},
741+
Some(6),
742+
),
743+
];
744+
745+
let result = reconstruct_list_columns_with_table(rows);
746+
assert_eq!(result.len(), 1);
747+
assert_eq!(result[0].column.column_type, "list<float64>");
748+
}
749+
}

src/metadata_provider_duckdb.rs

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ use crate::metadata_provider::{
66
SQL_GET_DELETE_FILES_ADDED_BETWEEN_SNAPSHOTS, SQL_GET_LATEST_SNAPSHOT, SQL_GET_SCHEMA_BY_NAME,
77
SQL_GET_TABLE_BY_NAME, SQL_GET_TABLE_COLUMNS, SQL_LIST_ALL_COLUMNS, SQL_LIST_ALL_FILES,
88
SQL_LIST_ALL_TABLES, SQL_LIST_SCHEMAS, SQL_LIST_SNAPSHOTS, SQL_LIST_TABLES, SQL_TABLE_EXISTS,
9-
SchemaMetadata, SnapshotMetadata, TableMetadata, TableWithSchema,
9+
SchemaMetadata, SnapshotMetadata, TableMetadata, TableWithSchema, reconstruct_list_columns,
10+
reconstruct_list_columns_with_table,
1011
};
1112
use duckdb::AccessMode::ReadOnly;
1213
use duckdb::{Config, Connection, params};
@@ -145,22 +146,26 @@ impl MetadataProvider for DuckdbMetadataProvider {
145146
let conn = self.connection();
146147
let mut stmt = conn.prepare(SQL_GET_TABLE_COLUMNS)?;
147148

148-
let columns = stmt
149+
let raw_columns: Vec<(DuckLakeTableColumn, Option<i64>)> = stmt
149150
.query_map([table_id], |row| {
150151
let column_id: i64 = row.get(0)?;
151152
let column_name: String = row.get(1)?;
152153
let column_type: String = row.get(2)?;
153154
let nulls_allowed: Option<bool> = row.get(3)?;
154-
Ok(DuckLakeTableColumn::new(
155-
column_id,
156-
column_name,
157-
column_type,
158-
nulls_allowed.unwrap_or(true),
155+
let parent_column: Option<i64> = row.get(4)?;
156+
Ok((
157+
DuckLakeTableColumn::new(
158+
column_id,
159+
column_name,
160+
column_type,
161+
nulls_allowed.unwrap_or(true),
162+
),
163+
parent_column,
159164
))
160165
})?
161166
.collect::<Result<Vec<_>, _>>()?;
162167

163-
Ok(columns)
168+
Ok(reconstruct_list_columns(raw_columns))
164169
}
165170

166171
fn get_table_files_for_select(
@@ -307,29 +312,33 @@ impl MetadataProvider for DuckdbMetadataProvider {
307312
let conn = self.connection();
308313
let mut stmt = conn.prepare(SQL_LIST_ALL_COLUMNS)?;
309314

310-
let columns = stmt
315+
let raw_columns: Vec<(ColumnWithTable, Option<i64>)> = stmt
311316
.query_map(
312317
params![snapshot_id, snapshot_id, snapshot_id, snapshot_id],
313318
|row| {
314319
let schema_name: String = row.get(0)?;
315320
let table_name: String = row.get(1)?;
316321
let nulls_allowed: Option<bool> = row.get(5)?;
322+
let parent_column: Option<i64> = row.get(6)?;
317323
let column = DuckLakeTableColumn {
318324
column_id: row.get(2)?,
319325
column_name: row.get(3)?,
320326
column_type: row.get(4)?,
321327
is_nullable: nulls_allowed.unwrap_or(true),
322328
};
323-
Ok(ColumnWithTable {
324-
schema_name,
325-
table_name,
326-
column,
327-
})
329+
Ok((
330+
ColumnWithTable {
331+
schema_name,
332+
table_name,
333+
column,
334+
},
335+
parent_column,
336+
))
328337
},
329338
)?
330339
.collect::<Result<Vec<_>, _>>()?;
331340

332-
Ok(columns)
341+
Ok(reconstruct_list_columns_with_table(raw_columns))
333342
}
334343

335344
fn list_all_files(&self, snapshot_id: i64) -> crate::Result<Vec<FileWithTable>> {

0 commit comments

Comments
 (0)