|
5 | 5 |
|
6 | 6 | use crate::{DuckLakeError, Result}; |
7 | 7 | use datafusion::datasource::object_store::ObjectStoreUrl; |
| 8 | +use percent_encoding::percent_decode_str; |
8 | 9 | use std::sync::Arc; |
9 | 10 |
|
10 | 11 | /// Parses a data path into an ObjectStoreUrl and key path |
@@ -37,9 +38,14 @@ use std::sync::Arc; |
37 | 38 | /// # Ok::<(), datafusion_ducklake::DuckLakeError>(()) |
38 | 39 | /// ``` |
39 | 40 | pub fn parse_object_store_url(data_path: &str) -> Result<(ObjectStoreUrl, String)> { |
40 | | - if data_path.starts_with("s3://") { |
| 41 | + // Trim leading/trailing whitespace (#64) |
| 42 | + let data_path = data_path.trim(); |
| 43 | + |
| 44 | + // Use case-insensitive scheme matching per RFC 3986 (#61) |
| 45 | + let lower = data_path.to_ascii_lowercase(); |
| 46 | + if lower.starts_with("s3://") { |
41 | 47 | parse_s3_url(data_path) |
42 | | - } else if data_path.starts_with("file://") { |
| 48 | + } else if lower.starts_with("file://") { |
43 | 49 | parse_file_url(data_path) |
44 | 50 | } else { |
45 | 51 | parse_local_path(data_path) |
@@ -73,7 +79,20 @@ fn parse_file_url(data_path: &str) -> Result<(ObjectStoreUrl, String)> { |
73 | 79 | DuckLakeError::InvalidConfig(format!("Failed to create ObjectStoreUrl: {}", e)) |
74 | 80 | })?; |
75 | 81 |
|
76 | | - Ok((object_store_url, url.path().to_owned())) |
| 82 | + // Decode percent-encoded characters to preserve non-ASCII in local filesystem paths (#60). |
| 83 | + // url::Url::parse() percent-encodes UTF-8 characters, but the local filesystem expects |
| 84 | + // raw UTF-8 paths. |
| 85 | + let decoded_path = percent_decode_str(url.path()) |
| 86 | + .decode_utf8() |
| 87 | + .map_err(|e| { |
| 88 | + DuckLakeError::InvalidConfig(format!( |
| 89 | + "Invalid UTF-8 in file path '{}': {}", |
| 90 | + data_path, e |
| 91 | + )) |
| 92 | + })? |
| 93 | + .to_string(); |
| 94 | + |
| 95 | + Ok((object_store_url, decoded_path)) |
77 | 96 | } |
78 | 97 |
|
79 | 98 | /// Parse a local filesystem path into ObjectStoreUrl and key path |
@@ -674,12 +693,76 @@ mod tests { |
674 | 693 |
|
675 | 694 | #[test] |
676 | 695 | fn test_s3_url_uppercase_scheme() { |
677 | | - // URL schemes are case-insensitive per RFC 3986 |
678 | | - // The url::Url crate normalizes to lowercase |
679 | | - let result = parse_object_store_url("S3://bucket/path"); |
680 | | - // This will fail because our code checks for lowercase "s3://" |
681 | | - // This documents current behavior - we expect lowercase |
682 | | - assert!(result.is_err()); |
| 696 | + // URL schemes are case-insensitive per RFC 3986 (#61) |
| 697 | + let (url, path) = parse_object_store_url("S3://bucket/path").unwrap(); |
| 698 | + assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap()); |
| 699 | + assert_eq!(path, "/path"); |
| 700 | + } |
| 701 | + |
| 702 | + #[test] |
| 703 | + fn test_s3_url_mixed_case_scheme() { |
| 704 | + let (url, path) = parse_object_store_url("s3://bucket/data").unwrap(); |
| 705 | + assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap()); |
| 706 | + assert_eq!(path, "/data"); |
| 707 | + } |
| 708 | + |
| 709 | + #[test] |
| 710 | + fn test_file_url_uppercase_scheme() { |
| 711 | + let (url, path) = parse_object_store_url("FILE:///tmp/data").unwrap(); |
| 712 | + assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap()); |
| 713 | + assert_eq!(path, "/tmp/data"); |
| 714 | + } |
| 715 | + |
| 716 | + #[test] |
| 717 | + fn test_file_url_mixed_case_scheme() { |
| 718 | + let (url, path) = parse_object_store_url("File:///tmp/data").unwrap(); |
| 719 | + assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap()); |
| 720 | + assert_eq!(path, "/tmp/data"); |
| 721 | + } |
| 722 | + |
| 723 | + #[test] |
| 724 | + fn test_whitespace_trimming_s3() { |
| 725 | + // Leading/trailing whitespace should be trimmed (#64) |
| 726 | + let (url, path) = parse_object_store_url(" s3://bucket/data ").unwrap(); |
| 727 | + assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap()); |
| 728 | + assert_eq!(path, "/data"); |
| 729 | + } |
| 730 | + |
| 731 | + #[test] |
| 732 | + fn test_whitespace_trimming_file() { |
| 733 | + let (url, path) = parse_object_store_url(" file:///tmp/data ").unwrap(); |
| 734 | + assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap()); |
| 735 | + assert_eq!(path, "/tmp/data"); |
| 736 | + } |
| 737 | + |
| 738 | + #[test] |
| 739 | + fn test_file_url_non_ascii_preserved() { |
| 740 | + // Non-ASCII characters in file:// URLs should be preserved (#60) |
| 741 | + let (url, path) = parse_object_store_url("file:///tmp/données/données").unwrap(); |
| 742 | + assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap()); |
| 743 | + assert_eq!(path, "/tmp/données/données"); |
| 744 | + } |
| 745 | + |
| 746 | + #[test] |
| 747 | + fn test_file_url_unicode_preserved() { |
| 748 | + let (url, path) = parse_object_store_url("file:///home/用户/数据").unwrap(); |
| 749 | + assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap()); |
| 750 | + assert_eq!(path, "/home/用户/数据"); |
| 751 | + } |
| 752 | + |
| 753 | + #[test] |
| 754 | + fn test_file_url_spaces_preserved() { |
| 755 | + let (url, path) = parse_object_store_url("file:///tmp/my data/file").unwrap(); |
| 756 | + assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap()); |
| 757 | + assert_eq!(path, "/tmp/my data/file"); |
| 758 | + } |
| 759 | + |
| 760 | + #[test] |
| 761 | + fn test_whitespace_trimming_with_uppercase_scheme() { |
| 762 | + // Combined: whitespace + uppercase scheme |
| 763 | + let (url, path) = parse_object_store_url(" S3://bucket/data ").unwrap(); |
| 764 | + assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap()); |
| 765 | + assert_eq!(path, "/data"); |
683 | 766 | } |
684 | 767 |
|
685 | 768 | #[test] |
|
0 commit comments