Skip to content

Commit 5e07133

Browse files
authored
Merge pull request #69 from hotdata-dev/fix/path-resolver-hardening
fix: harden path resolver for whitespace, case, and unicode
2 parents 22ea714 + 5bfadfd commit 5e07133

2 files changed

Lines changed: 93 additions & 9 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ arrow = "57"
2222
parquet = "57"
2323
object_store = { version = "0.12.4", features = ["aws"] }
2424
url = "2.5"
25+
percent-encoding = "2"
2526
async-trait = "0.1"
2627
tokio = { version = "1", features = ["full"] }
2728
futures = "0.3"

src/path_resolver.rs

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
66
use crate::{DuckLakeError, Result};
77
use datafusion::datasource::object_store::ObjectStoreUrl;
8+
use percent_encoding::percent_decode_str;
89
use std::sync::Arc;
910

1011
/// Parses a data path into an ObjectStoreUrl and key path
@@ -37,9 +38,14 @@ use std::sync::Arc;
3738
/// # Ok::<(), datafusion_ducklake::DuckLakeError>(())
3839
/// ```
3940
pub fn parse_object_store_url(data_path: &str) -> Result<(ObjectStoreUrl, String)> {
40-
if data_path.starts_with("s3://") {
41+
// Trim leading/trailing whitespace (#64)
42+
let data_path = data_path.trim();
43+
44+
// Use case-insensitive scheme matching per RFC 3986 (#61)
45+
let lower = data_path.to_ascii_lowercase();
46+
if lower.starts_with("s3://") {
4147
parse_s3_url(data_path)
42-
} else if data_path.starts_with("file://") {
48+
} else if lower.starts_with("file://") {
4349
parse_file_url(data_path)
4450
} else {
4551
parse_local_path(data_path)
@@ -73,7 +79,20 @@ fn parse_file_url(data_path: &str) -> Result<(ObjectStoreUrl, String)> {
7379
DuckLakeError::InvalidConfig(format!("Failed to create ObjectStoreUrl: {}", e))
7480
})?;
7581

76-
Ok((object_store_url, url.path().to_owned()))
82+
// Decode percent-encoded characters to preserve non-ASCII in local filesystem paths (#60).
83+
// url::Url::parse() percent-encodes UTF-8 characters, but the local filesystem expects
84+
// raw UTF-8 paths.
85+
let decoded_path = percent_decode_str(url.path())
86+
.decode_utf8()
87+
.map_err(|e| {
88+
DuckLakeError::InvalidConfig(format!(
89+
"Invalid UTF-8 in file path '{}': {}",
90+
data_path, e
91+
))
92+
})?
93+
.to_string();
94+
95+
Ok((object_store_url, decoded_path))
7796
}
7897

7998
/// Parse a local filesystem path into ObjectStoreUrl and key path
@@ -674,12 +693,76 @@ mod tests {
674693

675694
#[test]
676695
fn test_s3_url_uppercase_scheme() {
677-
// URL schemes are case-insensitive per RFC 3986
678-
// The url::Url crate normalizes to lowercase
679-
let result = parse_object_store_url("S3://bucket/path");
680-
// This will fail because our code checks for lowercase "s3://"
681-
// This documents current behavior - we expect lowercase
682-
assert!(result.is_err());
696+
// URL schemes are case-insensitive per RFC 3986 (#61)
697+
let (url, path) = parse_object_store_url("S3://bucket/path").unwrap();
698+
assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap());
699+
assert_eq!(path, "/path");
700+
}
701+
702+
#[test]
703+
fn test_s3_url_mixed_case_scheme() {
704+
let (url, path) = parse_object_store_url("s3://bucket/data").unwrap();
705+
assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap());
706+
assert_eq!(path, "/data");
707+
}
708+
709+
#[test]
710+
fn test_file_url_uppercase_scheme() {
711+
let (url, path) = parse_object_store_url("FILE:///tmp/data").unwrap();
712+
assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap());
713+
assert_eq!(path, "/tmp/data");
714+
}
715+
716+
#[test]
717+
fn test_file_url_mixed_case_scheme() {
718+
let (url, path) = parse_object_store_url("File:///tmp/data").unwrap();
719+
assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap());
720+
assert_eq!(path, "/tmp/data");
721+
}
722+
723+
#[test]
724+
fn test_whitespace_trimming_s3() {
725+
// Leading/trailing whitespace should be trimmed (#64)
726+
let (url, path) = parse_object_store_url(" s3://bucket/data ").unwrap();
727+
assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap());
728+
assert_eq!(path, "/data");
729+
}
730+
731+
#[test]
732+
fn test_whitespace_trimming_file() {
733+
let (url, path) = parse_object_store_url(" file:///tmp/data ").unwrap();
734+
assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap());
735+
assert_eq!(path, "/tmp/data");
736+
}
737+
738+
#[test]
739+
fn test_file_url_non_ascii_preserved() {
740+
// Non-ASCII characters in file:// URLs should be preserved (#60)
741+
let (url, path) = parse_object_store_url("file:///tmp/données/données").unwrap();
742+
assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap());
743+
assert_eq!(path, "/tmp/données/données");
744+
}
745+
746+
#[test]
747+
fn test_file_url_unicode_preserved() {
748+
let (url, path) = parse_object_store_url("file:///home/用户/数据").unwrap();
749+
assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap());
750+
assert_eq!(path, "/home/用户/数据");
751+
}
752+
753+
#[test]
754+
fn test_file_url_spaces_preserved() {
755+
let (url, path) = parse_object_store_url("file:///tmp/my data/file").unwrap();
756+
assert_eq!(url, ObjectStoreUrl::parse("file:///").unwrap());
757+
assert_eq!(path, "/tmp/my data/file");
758+
}
759+
760+
#[test]
761+
fn test_whitespace_trimming_with_uppercase_scheme() {
762+
// Combined: whitespace + uppercase scheme
763+
let (url, path) = parse_object_store_url(" S3://bucket/data ").unwrap();
764+
assert_eq!(url, ObjectStoreUrl::parse("s3://bucket/").unwrap());
765+
assert_eq!(path, "/data");
683766
}
684767

685768
#[test]

0 commit comments

Comments
 (0)