11# ruff: noqa: I002
22# isort: dont-add-import: from __future__ import annotations
33
4+ import logging
45from typing import TYPE_CHECKING , Any , Union
56
67from daft import context , runners
78from daft .api_annotations import PublicAPI
89from daft .daft import IOConfig , ScanOperatorHandle , StorageConfig
910from daft .dataframe import DataFrame
11+ from daft .filesystem import get_protocol_from_path
1012from daft .io ._checkpoint import attach_checkpoint
1113from daft .logical .builder import LogicalPlanBuilder
1214
1618 from daft .checkpoint import CheckpointConfig
1719
1820
19- def _convert_iceberg_file_io_properties_to_io_config (props : dict [str , Any ]) -> IOConfig | None :
20- """Property keys defined here: https://github.com/apache/iceberg-python/blob/main/pyiceberg/io/__init__.py."""
21+ logger = logging .getLogger (__name__ )
22+
23+
24+ def _convert_iceberg_file_io_properties_to_io_config (
25+ props : dict [str , Any ], location : str | None = None
26+ ) -> IOConfig | None :
27+ """Property keys defined here: https://github.com/apache/iceberg-python/blob/main/pyiceberg/io/__init__.py.
28+
29+ For an ``oss://`` ``location`` (Alibaba Cloud OSS, S3-compatible), the IOConfig gets
30+ virtual-hosted addressing and an ``oss``->``s3`` alias so the S3 filesystem resolves
31+ ``oss://`` paths -- applied even with no IO properties (e.g. env-var credentials).
32+ """
2133 from daft .io import AzureConfig , GCSConfig , IOConfig , S3Config
2234
2335 any_props_set = False
@@ -30,13 +42,16 @@ def get_first_property_value(*property_names: str) -> Any | None:
3042 return property_value
3143 return None
3244
45+ is_oss = location is not None and get_protocol_from_path (location ) == "oss"
46+
3347 io_config = IOConfig (
3448 s3 = S3Config (
3549 endpoint_url = get_first_property_value ("s3.endpoint" ),
3650 region_name = get_first_property_value ("s3.region" , "client.region" ),
3751 key_id = get_first_property_value ("s3.access-key-id" , "client.access-key-id" ),
3852 access_key = get_first_property_value ("s3.secret-access-key" , "client.secret-access-key" ),
3953 session_token = get_first_property_value ("s3.session-token" , "client.session-token" ),
54+ force_virtual_addressing = True if is_oss else None ,
4055 ),
4156 azure = AzureConfig (
4257 storage_account = get_first_property_value ("adls.account-name" , "adlfs.account-name" ),
@@ -50,8 +65,12 @@ def get_first_property_value(*property_names: str) -> Any | None:
5065 project_id = get_first_property_value ("gcs.project-id" ),
5166 token = get_first_property_value ("gcs.oauth2.token" ),
5267 ),
68+ protocol_aliases = {"oss" : "s3" } if is_oss else None ,
5369 )
5470
71+ if is_oss :
72+ logger .debug ("oss:// table detected; applying S3-compatible settings to the IOConfig" )
73+ return io_config
5574 return io_config if any_props_set else None
5675
5776
@@ -107,7 +126,9 @@ def read_iceberg(
107126 table = StaticTable .from_metadata (metadata_location = table )
108127
109128 io_config = (
110- _convert_iceberg_file_io_properties_to_io_config (table .io .properties ) if io_config is None else io_config
129+ _convert_iceberg_file_io_properties_to_io_config (table .io .properties , table .location ())
130+ if io_config is None
131+ else io_config
111132 )
112133 io_config = context .get_context ().daft_planning_config .default_io_config if io_config is None else io_config
113134
0 commit comments