Skip to content

Commit 636f8ef

Browse files
committed
fix: preserve type precision in Arrow-to-DuckLake roundtrip
arrow_to_ducklake_type() previously mapped multiple Arrow types to the same DuckLake type string, losing precision on roundtrip: - Date64 -> "date" -> Date32 - LargeUtf8 -> "varchar" -> Utf8 - LargeBinary -> "blob" -> Binary - Time32(Second/Millisecond) -> "time" -> Time64(Microsecond) - Time64(Nanosecond) -> "time" -> Time64(Microsecond) Use distinct DuckLake type strings for each Arrow type: - Date64 -> "date64", LargeUtf8 -> "large_varchar", LargeBinary -> "large_blob", Time32(s) -> "time_s", Time32(ms) -> "time_ms", Time64(ns) -> "time_ns" Add corresponding mappings in ducklake_to_arrow_type() and expand the roundtrip test to cover all previously-lossy types. Closes #62
1 parent 22ea714 commit 636f8ef

1 file changed

Lines changed: 86 additions & 27 deletions

File tree

src/types.rs

Lines changed: 86 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,11 @@ pub fn ducklake_to_arrow_type(ducklake_type: &str) -> Result<DataType> {
4141

4242
// Temporal types
4343
"time" => Ok(DataType::Time64(TimeUnit::Microsecond)),
44+
"time_s" => Ok(DataType::Time32(TimeUnit::Second)),
45+
"time_ms" => Ok(DataType::Time32(TimeUnit::Millisecond)),
46+
"time_ns" => Ok(DataType::Time64(TimeUnit::Nanosecond)),
4447
"date" => Ok(DataType::Date32),
48+
"date64" => Ok(DataType::Date64),
4549
"timestamp" => Ok(DataType::Timestamp(TimeUnit::Microsecond, None)),
4650
"timestamptz" | "timestamp with time zone" => Ok(DataType::Timestamp(
4751
TimeUnit::Microsecond,
@@ -54,10 +58,12 @@ pub fn ducklake_to_arrow_type(ducklake_type: &str) -> Result<DataType> {
5458

5559
// String types
5660
"varchar" | "text" | "string" => Ok(DataType::Utf8),
61+
"large_varchar" => Ok(DataType::LargeUtf8),
5762
"json" => Ok(DataType::Utf8), // JSON stored as UTF8 string
5863

5964
// Binary types
6065
"blob" | "binary" | "bytea" => Ok(DataType::Binary),
66+
"large_blob" => Ok(DataType::LargeBinary),
6167
"uuid" => Ok(DataType::FixedSizeBinary(16)),
6268

6369
// Geometry types (stored as binary WKB format)
@@ -114,7 +120,12 @@ pub fn arrow_to_ducklake_type(arrow_type: &DataType) -> Result<String> {
114120
DataType::Float64 => Ok("float64".to_string()),
115121

116122
// Temporal types
117-
DataType::Date32 | DataType::Date64 => Ok("date".to_string()),
123+
DataType::Date32 => Ok("date".to_string()),
124+
DataType::Date64 => Ok("date64".to_string()),
125+
DataType::Time32(TimeUnit::Second) => Ok("time_s".to_string()),
126+
DataType::Time32(TimeUnit::Millisecond) => Ok("time_ms".to_string()),
127+
DataType::Time64(TimeUnit::Microsecond) => Ok("time".to_string()),
128+
DataType::Time64(TimeUnit::Nanosecond) => Ok("time_ns".to_string()),
118129
DataType::Time32(_) | DataType::Time64(_) => Ok("time".to_string()),
119130
DataType::Timestamp(TimeUnit::Second, None) => Ok("timestamp_s".to_string()),
120131
DataType::Timestamp(TimeUnit::Millisecond, None) => Ok("timestamp_ms".to_string()),
@@ -124,10 +135,12 @@ pub fn arrow_to_ducklake_type(arrow_type: &DataType) -> Result<String> {
124135
DataType::Interval(_) => Ok("interval".to_string()),
125136

126137
// String types
127-
DataType::Utf8 | DataType::LargeUtf8 => Ok("varchar".to_string()),
138+
DataType::Utf8 => Ok("varchar".to_string()),
139+
DataType::LargeUtf8 => Ok("large_varchar".to_string()),
128140

129141
// Binary types
130-
DataType::Binary | DataType::LargeBinary => Ok("blob".to_string()),
142+
DataType::Binary => Ok("blob".to_string()),
143+
DataType::LargeBinary => Ok("large_blob".to_string()),
131144
DataType::FixedSizeBinary(16) => Ok("uuid".to_string()),
132145
DataType::FixedSizeBinary(_) => Ok("blob".to_string()),
133146

@@ -275,35 +288,30 @@ mod tests {
275288

276289
#[test]
277290
fn test_build_read_schema_with_renamed_columns() {
278-
// Simulate: column was originally named "user_id", now renamed to "userId"
279291
let current_columns = vec![
280292
DuckLakeTableColumn {
281293
column_id: 1,
282-
column_name: "userId".to_string(), // Current name (renamed)
294+
column_name: "userId".to_string(),
283295
column_type: "int32".to_string(),
284296
is_nullable: true,
285297
},
286298
DuckLakeTableColumn {
287299
column_id: 2,
288-
column_name: "name".to_string(), // Not renamed
300+
column_name: "name".to_string(),
289301
column_type: "varchar".to_string(),
290302
is_nullable: true,
291303
},
292304
];
293305

294-
// Parquet file has original names
295306
let mut parquet_field_ids = HashMap::new();
296-
parquet_field_ids.insert(1, "user_id".to_string()); // Original name
297-
parquet_field_ids.insert(2, "name".to_string()); // Same name
307+
parquet_field_ids.insert(1, "user_id".to_string());
308+
parquet_field_ids.insert(2, "name".to_string());
298309

299310
let (read_schema, name_mapping) =
300311
build_read_schema_with_field_id_mapping(&current_columns, &parquet_field_ids).unwrap();
301312

302-
// Read schema should have original Parquet names
303313
assert_eq!(read_schema.field(0).name(), "user_id");
304314
assert_eq!(read_schema.field(1).name(), "name");
305-
306-
// Name mapping should map old name to new name
307315
assert_eq!(name_mapping.len(), 1);
308316
assert_eq!(name_mapping.get("user_id"), Some(&"userId".to_string()));
309317
}
@@ -318,31 +326,29 @@ mod tests {
318326
}];
319327

320328
let mut parquet_field_ids = HashMap::new();
321-
parquet_field_ids.insert(1, "id".to_string()); // Same name
329+
parquet_field_ids.insert(1, "id".to_string());
322330

323331
let (read_schema, name_mapping) =
324332
build_read_schema_with_field_id_mapping(&current_columns, &parquet_field_ids).unwrap();
325333

326334
assert_eq!(read_schema.field(0).name(), "id");
327-
assert!(name_mapping.is_empty()); // No rename needed
335+
assert!(name_mapping.is_empty());
328336
}
329337

330338
#[test]
331339
fn test_build_read_schema_no_field_ids() {
332-
// External file without field_ids
333340
let current_columns = vec![DuckLakeTableColumn {
334341
column_id: 1,
335342
column_name: "id".to_string(),
336343
column_type: "int32".to_string(),
337344
is_nullable: true,
338345
}];
339346

340-
let parquet_field_ids = HashMap::new(); // No field_ids in Parquet
347+
let parquet_field_ids = HashMap::new();
341348

342349
let (read_schema, name_mapping) =
343350
build_read_schema_with_field_id_mapping(&current_columns, &parquet_field_ids).unwrap();
344351

345-
// Falls back to current column name
346352
assert_eq!(read_schema.field(0).name(), "id");
347353
assert!(name_mapping.is_empty());
348354
}
@@ -363,6 +369,34 @@ mod tests {
363369
assert_eq!(ducklake_to_arrow_type("blob").unwrap(), DataType::Binary);
364370
}
365371

372+
#[test]
373+
fn test_precision_preserving_types() {
374+
assert_eq!(
375+
ducklake_to_arrow_type("date64").unwrap(),
376+
DataType::Date64
377+
);
378+
assert_eq!(
379+
ducklake_to_arrow_type("large_varchar").unwrap(),
380+
DataType::LargeUtf8
381+
);
382+
assert_eq!(
383+
ducklake_to_arrow_type("large_blob").unwrap(),
384+
DataType::LargeBinary
385+
);
386+
assert_eq!(
387+
ducklake_to_arrow_type("time_s").unwrap(),
388+
DataType::Time32(TimeUnit::Second)
389+
);
390+
assert_eq!(
391+
ducklake_to_arrow_type("time_ms").unwrap(),
392+
DataType::Time32(TimeUnit::Millisecond)
393+
);
394+
assert_eq!(
395+
ducklake_to_arrow_type("time_ns").unwrap(),
396+
DataType::Time64(TimeUnit::Nanosecond)
397+
);
398+
}
399+
366400
#[test]
367401
fn test_decimal_types() {
368402
assert_eq!(
@@ -386,7 +420,6 @@ mod tests {
386420

387421
#[test]
388422
fn test_unsupported_list_type_errors() {
389-
// Test list type returns error
390423
let result = ducklake_to_arrow_type("list<int32>");
391424
assert!(result.is_err());
392425
match result {
@@ -401,7 +434,6 @@ mod tests {
401434

402435
#[test]
403436
fn test_unsupported_array_type_errors() {
404-
// Test array type returns error
405437
let result = ducklake_to_arrow_type("array<varchar>");
406438
assert!(result.is_err());
407439
match result {
@@ -415,7 +447,6 @@ mod tests {
415447

416448
#[test]
417449
fn test_unsupported_struct_type_errors() {
418-
// Test struct type returns error
419450
let result = ducklake_to_arrow_type("struct<a:int32,b:varchar>");
420451
assert!(result.is_err());
421452
match result {
@@ -430,7 +461,6 @@ mod tests {
430461

431462
#[test]
432463
fn test_unsupported_map_type_errors() {
433-
// Test map type returns error
434464
let result = ducklake_to_arrow_type("map<varchar,int32>");
435465
assert!(result.is_err());
436466
match result {
@@ -445,7 +475,6 @@ mod tests {
445475

446476
#[test]
447477
fn test_nested_complex_types_error() {
448-
// Test nested complex types return error
449478
let result = ducklake_to_arrow_type("list<struct<a:int32,b:varchar>>");
450479
assert!(result.is_err());
451480
match result {
@@ -459,7 +488,6 @@ mod tests {
459488

460489
#[test]
461490
fn test_unknown_type_error() {
462-
// Test completely unknown types also return error
463491
let result = ducklake_to_arrow_type("completely_unknown_type");
464492
assert!(result.is_err());
465493
match result {
@@ -493,17 +521,40 @@ mod tests {
493521
"float64"
494522
);
495523
assert_eq!(arrow_to_ducklake_type(&DataType::Utf8).unwrap(), "varchar");
524+
assert_eq!(
525+
arrow_to_ducklake_type(&DataType::LargeUtf8).unwrap(),
526+
"large_varchar"
527+
);
496528
assert_eq!(arrow_to_ducklake_type(&DataType::Binary).unwrap(), "blob");
529+
assert_eq!(
530+
arrow_to_ducklake_type(&DataType::LargeBinary).unwrap(),
531+
"large_blob"
532+
);
497533
}
498534

499535
#[test]
500536
fn test_arrow_to_ducklake_temporal_types() {
501537
assert_eq!(arrow_to_ducklake_type(&DataType::Date32).unwrap(), "date");
502-
assert_eq!(arrow_to_ducklake_type(&DataType::Date64).unwrap(), "date");
538+
assert_eq!(
539+
arrow_to_ducklake_type(&DataType::Date64).unwrap(),
540+
"date64"
541+
);
542+
assert_eq!(
543+
arrow_to_ducklake_type(&DataType::Time32(TimeUnit::Second)).unwrap(),
544+
"time_s"
545+
);
546+
assert_eq!(
547+
arrow_to_ducklake_type(&DataType::Time32(TimeUnit::Millisecond)).unwrap(),
548+
"time_ms"
549+
);
503550
assert_eq!(
504551
arrow_to_ducklake_type(&DataType::Time64(TimeUnit::Microsecond)).unwrap(),
505552
"time"
506553
);
554+
assert_eq!(
555+
arrow_to_ducklake_type(&DataType::Time64(TimeUnit::Nanosecond)).unwrap(),
556+
"time_ns"
557+
);
507558
assert_eq!(
508559
arrow_to_ducklake_type(&DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(),
509560
"timestamp"
@@ -536,7 +587,6 @@ mod tests {
536587
arrow_to_ducklake_type(&DataType::FixedSizeBinary(16)).unwrap(),
537588
"uuid"
538589
);
539-
// Non-16 byte fixed size binary becomes blob
540590
assert_eq!(
541591
arrow_to_ducklake_type(&DataType::FixedSizeBinary(32)).unwrap(),
542592
"blob"
@@ -545,16 +595,26 @@ mod tests {
545595

546596
#[test]
547597
fn test_arrow_to_ducklake_roundtrip() {
548-
// Verify roundtrip: arrow -> ducklake -> arrow for common types
598+
// Verify roundtrip: arrow -> ducklake -> arrow for all supported types
549599
let test_types = vec![
550600
DataType::Boolean,
551601
DataType::Int32,
552602
DataType::Int64,
553603
DataType::Float64,
554604
DataType::Utf8,
605+
DataType::LargeUtf8,
555606
DataType::Binary,
607+
DataType::LargeBinary,
556608
DataType::Date32,
609+
DataType::Date64,
610+
DataType::Time32(TimeUnit::Second),
611+
DataType::Time32(TimeUnit::Millisecond),
612+
DataType::Time64(TimeUnit::Microsecond),
613+
DataType::Time64(TimeUnit::Nanosecond),
614+
DataType::Timestamp(TimeUnit::Second, None),
615+
DataType::Timestamp(TimeUnit::Millisecond, None),
557616
DataType::Timestamp(TimeUnit::Microsecond, None),
617+
DataType::Timestamp(TimeUnit::Nanosecond, None),
558618
DataType::Decimal128(10, 2),
559619
];
560620

@@ -595,7 +655,6 @@ mod tests {
595655

596656
#[test]
597657
fn test_build_schema_with_unsupported_type() {
598-
// Test that build_arrow_schema propagates complex type errors
599658
let columns = vec![
600659
DuckLakeTableColumn {
601660
column_id: 1,

0 commit comments

Comments
 (0)