From 58ba73c1eb3b44aa7da453ccde79def409ac2996 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 28 Dec 2024 21:02:26 +0800 Subject: [PATCH 01/42] feat: enable obstore for minio in local successfully run it on local, not yet tested remote Signed-off-by: machichima --- flytekit/core/data_persistence.py | 90 +++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 17 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 0640bc2eb5..5a6235c66c 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -25,13 +25,15 @@ import tempfile import typing from time import sleep -from typing import Any, Dict, Optional, Union, cast +from typing import Any, Dict, Optional, Tuple, Union, cast from uuid import UUID import fsspec from decorator import decorator from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol +from obstore.fsspec import AsyncFsspecStore +from obstore.store import S3Store from typing_extensions import Unpack from flytekit import configuration @@ -46,33 +48,78 @@ # Refer to https://github.com/fsspec/s3fs/blob/50bafe4d8766c3b2a4e1fc09669cf02fb2d71454/s3fs/core.py#L198 # for key and secret -_FSSPEC_S3_KEY_ID = "key" -_FSSPEC_S3_SECRET = "secret" +_FSSPEC_S3_KEY_ID = "access_key_id" +_FSSPEC_S3_SECRET = "secret_access_key" _ANON = "anon" Uploadable = typing.Union[str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO] -def s3_setup_args(s3_cfg: configuration.S3Config, anonymous: bool = False) -> Dict[str, Any]: - kwargs: Dict[str, Any] = { - "cache_regions": True, - } +def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: + kwargs: Dict[str, Any] = {} + store_kwargs: Dict[str, Any] = {} + if s3_cfg.access_key_id: - kwargs[_FSSPEC_S3_KEY_ID] = s3_cfg.access_key_id + store_kwargs[_FSSPEC_S3_KEY_ID] = s3_cfg.access_key_id if s3_cfg.secret_access_key: - kwargs[_FSSPEC_S3_SECRET] = s3_cfg.secret_access_key + store_kwargs[_FSSPEC_S3_SECRET] = s3_cfg.secret_access_key # S3fs takes this as a special arg if s3_cfg.endpoint is not None: - kwargs["client_kwargs"] = {"endpoint_url": s3_cfg.endpoint} + store_kwargs["endpoint_url"] = s3_cfg.endpoint + # kwargs["client_kwargs"] = {"endpoint_url": s3_cfg.endpoint} + + store = S3Store.from_env( + bucket, + config={ + **store_kwargs, + "aws_allow_http": "true", # Allow HTTP connections + "aws_virtual_hosted_style_request": "false", # Use path-style addressing + }, + ) + + # if anonymous: + # kwargs[_ANON] = True - if anonymous: - kwargs[_ANON] = True + kwargs["store"] = store return kwargs +def split_path(path: str) -> Tuple[str, str]: + """ + Split bucket and file path + + Parameters + ---------- + path : string + Input path, like `s3://mybucket/path/to/file` + + Examples + -------- + >>> split_path("s3://mybucket/path/to/file") + ['mybucket', 'path/to/file'] + """ + if "file" in path: + # no bucket for file + return "", path + protocol = get_protocol(path) + if path.startswith(protocol + "://"): + path = path[len(protocol) + 3 :] + elif path.startswith(protocol + "::"): + path = path[len(protocol) + 2 :] + path = path.strip("/") + + if "/" not in path: + return path, "" + else: + path_li = path.split("/") + bucket = path_li[0] + file_path = "/".join(path_li[1:]) + return (bucket, file_path) + + def azure_setup_args(azure_cfg: configuration.AzureBlobStorageConfig, anonymous: bool = False) -> Dict[str, Any]: kwargs: Dict[str, Any] = {} @@ -189,6 +236,7 @@ def 
get_filesystem( protocol: typing.Optional[str] = None, anonymous: bool = False, path: typing.Optional[str] = None, + bucket: str = "", **kwargs, ) -> fsspec.AbstractFileSystem: if not protocol: @@ -197,13 +245,14 @@ def get_filesystem( kwargs["auto_mkdir"] = True return FlyteLocalFileSystem(**kwargs) elif protocol == "s3": - s3kwargs = s3_setup_args(self._data_config.s3, anonymous=anonymous) + s3kwargs = s3_setup_args(self._data_config.s3, bucket, anonymous=anonymous) s3kwargs.update(kwargs) return fsspec.filesystem(protocol, **s3kwargs) # type: ignore elif protocol == "gs": if anonymous: kwargs["token"] = _ANON return fsspec.filesystem(protocol, **kwargs) # type: ignore + # TODO: add azure elif protocol == "ftp": kwargs.update(fsspec.implementations.ftp.FTPFileSystem._get_kwargs_from_urls(path)) return fsspec.filesystem(protocol, **kwargs) @@ -216,12 +265,16 @@ def get_filesystem( return fsspec.filesystem(protocol, **kwargs) async def get_async_filesystem_for_path( - self, path: str = "", anonymous: bool = False, **kwargs + self, path: str = "", bucket: str = "", anonymous: bool = False, **kwargs ) -> Union[AsyncFileSystem, fsspec.AbstractFileSystem]: protocol = get_protocol(path) loop = asyncio.get_running_loop() - return self.get_filesystem(protocol, anonymous=anonymous, path=path, asynchronous=True, loop=loop, **kwargs) + # TODO: get bucket here? + + return self.get_filesystem( + protocol, anonymous=anonymous, path=path, bucket=bucket, asynchronous=True, loop=loop, **kwargs + ) def get_filesystem_for_path(self, path: str = "", anonymous: bool = False, **kwargs) -> fsspec.AbstractFileSystem: protocol = get_protocol(path) @@ -295,7 +348,8 @@ def exists(self, path: str) -> bool: @retry_request async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwargs): - file_system = await self.get_async_filesystem_for_path(from_path) + bucket, from_path_file_only = split_path(from_path) + file_system = await self.get_async_filesystem_for_path(from_path, bucket) if recursive: from_path, to_path = self.recursive_paths(from_path, to_path) try: @@ -307,7 +361,7 @@ async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwa ) logger.info(f"Getting {from_path} to {to_path}") if isinstance(file_system, AsyncFileSystem): - dst = await file_system._get(from_path, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 + dst = await file_system._get(from_path_file_only, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 else: dst = file_system.get(from_path, to_path, recursive=recursive, **kwargs) if isinstance(dst, (str, pathlib.Path)): @@ -635,6 +689,8 @@ async def async_put_data( put_data = loop_manager.synced(async_put_data) +fsspec.register_implementation("s3", AsyncFsspecStore) + flyte_tmp_dir = tempfile.mkdtemp(prefix="flyte-") default_local_file_access_provider = FileAccessProvider( local_sandbox_dir=os.path.join(flyte_tmp_dir, "sandbox"), From 79ea46d1e4c3d369422e3a98399ebb507df46d16 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 29 Dec 2024 16:22:31 +0800 Subject: [PATCH 02/42] feat: enable obstore write to remote minio s3 Signed-off-by: machichima --- flytekit/core/data_persistence.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 5a6235c66c..ff934b4d6c 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -390,7 +390,8 @@ async def _put(self, from_path: str, to_path: str, recursive: bool = 
False, **kw More of an internal function to be called by put_data and put_raw_data This does not need a separate sync function. """ - file_system = await self.get_async_filesystem_for_path(to_path) + bucket, to_path_file_only = split_path(to_path) + file_system = await self.get_async_filesystem_for_path(to_path, bucket) from_path = self.strip_file_header(from_path) if recursive: # Only check this for the local filesystem @@ -408,7 +409,7 @@ async def _put(self, from_path: str, to_path: str, recursive: bool = False, **kw kwargs["metadata"] = {} kwargs["metadata"].update(self._execution_metadata) if isinstance(file_system, AsyncFileSystem): - dst = await file_system._put(from_path, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 + dst = await file_system._put(from_path, to_path_file_only, recursive=recursive, **kwargs) # pylint: disable=W0212 else: dst = file_system.put(from_path, to_path, recursive=recursive, **kwargs) if isinstance(dst, (str, pathlib.Path)): From caaa65790002b728b11275f0843acb4e8750569d Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 30 Dec 2024 23:18:14 +0800 Subject: [PATCH 03/42] feat: use obstore for gcs Signed-off-by: machichima --- flytekit/core/data_persistence.py | 52 ++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index ff934b4d6c..3e7427c226 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -33,7 +33,7 @@ from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol from obstore.fsspec import AsyncFsspecStore -from obstore.store import S3Store +from obstore.store import GCSStore, S3Store from typing_extensions import Unpack from flytekit import configuration @@ -79,8 +79,22 @@ def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: b }, ) - # if anonymous: - # kwargs[_ANON] = True + if anonymous: + kwargs[_ANON] = True + + kwargs["store"] = store + + return kwargs + +def gs_setup_args(gcs_cfg: configuration.GCSConfig, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: + kwargs: Dict[str, Any] = {} + + store = GCSStore.from_env( + bucket, + ) + + if anonymous: + kwargs["token"] = _ANON kwargs["store"] = store @@ -116,8 +130,14 @@ def split_path(path: str) -> Tuple[str, str]: else: path_li = path.split("/") bucket = path_li[0] - file_path = "/".join(path_li[1:]) - return (bucket, file_path) + # use obstore for s3 and gcs only now, no need to split + # bucket out of path for other storage + support_types = ["s3", "gs"] + if protocol in support_types: + file_path = "/".join(path_li[1:]) + return (bucket, file_path) + else: + return bucket, path def azure_setup_args(azure_cfg: configuration.AzureBlobStorageConfig, anonymous: bool = False) -> Dict[str, Any]: @@ -239,6 +259,7 @@ def get_filesystem( bucket: str = "", **kwargs, ) -> fsspec.AbstractFileSystem: + # TODO: add bucket to adlfs if not protocol: return self._default_remote if protocol == "file": @@ -249,9 +270,9 @@ def get_filesystem( s3kwargs.update(kwargs) return fsspec.filesystem(protocol, **s3kwargs) # type: ignore elif protocol == "gs": - if anonymous: - kwargs["token"] = _ANON - return fsspec.filesystem(protocol, **kwargs) # type: ignore + gskwargs = gs_setup_args(self._data_config.gcs, bucket, anonymous=anonymous) + gskwargs.update(kwargs) + return fsspec.filesystem(protocol, **gskwargs) # type: ignore # TODO: add azure elif protocol == "ftp": 
kwargs.update(fsspec.implementations.ftp.FTPFileSystem._get_kwargs_from_urls(path)) @@ -270,15 +291,13 @@ async def get_async_filesystem_for_path( protocol = get_protocol(path) loop = asyncio.get_running_loop() - # TODO: get bucket here? - return self.get_filesystem( protocol, anonymous=anonymous, path=path, bucket=bucket, asynchronous=True, loop=loop, **kwargs ) - def get_filesystem_for_path(self, path: str = "", anonymous: bool = False, **kwargs) -> fsspec.AbstractFileSystem: + def get_filesystem_for_path(self, path: str = "", bucket: str = "", anonymous: bool = False, **kwargs) -> fsspec.AbstractFileSystem: protocol = get_protocol(path) - return self.get_filesystem(protocol, anonymous=anonymous, path=path, **kwargs) + return self.get_filesystem(protocol, anonymous=anonymous, path=path, bucket=bucket, **kwargs) @staticmethod def is_remote(path: Union[str, os.PathLike]) -> bool: @@ -478,11 +497,13 @@ async def async_put_raw_data( r = await self._put(from_path, to_path, **kwargs) return r or to_path + bucket, to_path_file_only = split_path(to_path) + # See https://github.com/fsspec/s3fs/issues/871 for more background and pending work on the fsspec side to # support effectively async open(). For now these use-cases below will revert to sync calls. # raw bytes if isinstance(lpath, bytes): - fs = self.get_filesystem_for_path(to_path) + fs = self.get_filesystem_for_path(to_path_file_only, bucket) with fs.open(to_path, "wb", **kwargs) as s: s.write(lpath) return to_path @@ -491,7 +512,7 @@ async def async_put_raw_data( if isinstance(lpath, io.BufferedReader) or isinstance(lpath, io.BytesIO): if not lpath.readable(): raise FlyteAssertion("Buffered reader must be readable") - fs = self.get_filesystem_for_path(to_path) + fs = self.get_filesystem_for_path(to_path_file_only, bucket) lpath.seek(0) with fs.open(to_path, "wb", **kwargs) as s: while data := lpath.read(read_chunk_size_bytes): @@ -501,7 +522,7 @@ async def async_put_raw_data( if isinstance(lpath, io.StringIO): if not lpath.readable(): raise FlyteAssertion("Buffered reader must be readable") - fs = self.get_filesystem_for_path(to_path) + fs = self.get_filesystem_for_path(to_path_file_only, bucket) lpath.seek(0) with fs.open(to_path, "wb", **kwargs) as s: while data_str := lpath.read(read_chunk_size_bytes): @@ -691,6 +712,7 @@ async def async_put_data( fsspec.register_implementation("s3", AsyncFsspecStore) +fsspec.register_implementation("gs", AsyncFsspecStore) flyte_tmp_dir = tempfile.mkdtemp(prefix="flyte-") default_local_file_access_provider = FileAccessProvider( From 7ba66e25f9957af9d53057bba38c28606c05d26f Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 1 Jan 2025 23:06:42 +0800 Subject: [PATCH 04/42] feat: use obstore for azure blob storage (abfs) Signed-off-by: machichima --- flytekit/core/data_persistence.py | 37 ++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 3e7427c226..c65d2e87c9 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -33,7 +33,7 @@ from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol from obstore.fsspec import AsyncFsspecStore -from obstore.store import GCSStore, S3Store +from obstore.store import GCSStore, S3Store, AzureStore from typing_extensions import Unpack from flytekit import configuration @@ -132,7 +132,7 @@ def split_path(path: str) -> Tuple[str, str]: bucket = path_li[0] # use obstore for s3 and gcs only now, no need 
to split # bucket out of path for other storage - support_types = ["s3", "gs"] + support_types = ["s3", "gs", "abfs"] if protocol in support_types: file_path = "/".join(path_li[1:]) return (bucket, file_path) @@ -140,20 +140,33 @@ def split_path(path: str) -> Tuple[str, str]: return bucket, path -def azure_setup_args(azure_cfg: configuration.AzureBlobStorageConfig, anonymous: bool = False) -> Dict[str, Any]: +def azure_setup_args(azure_cfg: configuration.AzureBlobStorageConfig, container: str = "", anonymous: bool = False) -> Dict[str, Any]: kwargs: Dict[str, Any] = {} + store_kwargs: Dict[str, Any] = {} if azure_cfg.account_name: - kwargs["account_name"] = azure_cfg.account_name + store_kwargs["account_name"] = azure_cfg.account_name if azure_cfg.account_key: - kwargs["account_key"] = azure_cfg.account_key + store_kwargs["account_key"] = azure_cfg.account_key if azure_cfg.client_id: - kwargs["client_id"] = azure_cfg.client_id + store_kwargs["client_id"] = azure_cfg.client_id if azure_cfg.client_secret: - kwargs["client_secret"] = azure_cfg.client_secret + store_kwargs["client_secret"] = azure_cfg.client_secret if azure_cfg.tenant_id: - kwargs["tenant_id"] = azure_cfg.tenant_id - kwargs[_ANON] = anonymous + store_kwargs["tenant_id"] = azure_cfg.tenant_id + + store = AzureStore.from_env( + container, + config={ + **store_kwargs, + }, + ) + + kwargs["store"] = store + + if anonymous: + kwargs[_ANON] = True + return kwargs @@ -273,7 +286,10 @@ def get_filesystem( gskwargs = gs_setup_args(self._data_config.gcs, bucket, anonymous=anonymous) gskwargs.update(kwargs) return fsspec.filesystem(protocol, **gskwargs) # type: ignore - # TODO: add azure + elif protocol == "abfs": + azkwargs = azure_setup_args(self._data_config.azure, bucket, anonymous=anonymous) + azkwargs.update(kwargs) + return fsspec.filesystem(protocol, **azkwargs) # type: ignore elif protocol == "ftp": kwargs.update(fsspec.implementations.ftp.FTPFileSystem._get_kwargs_from_urls(path)) return fsspec.filesystem(protocol, **kwargs) @@ -713,6 +729,7 @@ async def async_put_data( fsspec.register_implementation("s3", AsyncFsspecStore) fsspec.register_implementation("gs", AsyncFsspecStore) +fsspec.register_implementation("abfs", AsyncFsspecStore) flyte_tmp_dir = tempfile.mkdtemp(prefix="flyte-") default_local_file_access_provider = FileAccessProvider( From 0ef7c05de0686cb95d0388d2b48f65ff73640458 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 4 Jan 2025 22:15:50 +0800 Subject: [PATCH 05/42] fix: wrong file path for get_filesystem_for_path Signed-off-by: machichima --- flytekit/core/data_persistence.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index c65d2e87c9..a73fb8bd9d 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -33,7 +33,7 @@ from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol from obstore.fsspec import AsyncFsspecStore -from obstore.store import GCSStore, S3Store, AzureStore +from obstore.store import AzureStore, GCSStore, S3Store from typing_extensions import Unpack from flytekit import configuration @@ -86,6 +86,7 @@ def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: b return kwargs + def gs_setup_args(gcs_cfg: configuration.GCSConfig, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: kwargs: Dict[str, Any] = {} @@ -140,7 +141,9 @@ def split_path(path: str) -> Tuple[str, str]: return bucket, path -def 
azure_setup_args(azure_cfg: configuration.AzureBlobStorageConfig, container: str = "", anonymous: bool = False) -> Dict[str, Any]: +def azure_setup_args( + azure_cfg: configuration.AzureBlobStorageConfig, container: str = "", anonymous: bool = False +) -> Dict[str, Any]: kwargs: Dict[str, Any] = {} store_kwargs: Dict[str, Any] = {} @@ -311,7 +314,9 @@ async def get_async_filesystem_for_path( protocol, anonymous=anonymous, path=path, bucket=bucket, asynchronous=True, loop=loop, **kwargs ) - def get_filesystem_for_path(self, path: str = "", bucket: str = "", anonymous: bool = False, **kwargs) -> fsspec.AbstractFileSystem: + def get_filesystem_for_path( + self, path: str = "", bucket: str = "", anonymous: bool = False, **kwargs + ) -> fsspec.AbstractFileSystem: protocol = get_protocol(path) return self.get_filesystem(protocol, anonymous=anonymous, path=path, bucket=bucket, **kwargs) @@ -513,13 +518,13 @@ async def async_put_raw_data( r = await self._put(from_path, to_path, **kwargs) return r or to_path - bucket, to_path_file_only = split_path(to_path) + bucket, _ = split_path(to_path) # See https://github.com/fsspec/s3fs/issues/871 for more background and pending work on the fsspec side to # support effectively async open(). For now these use-cases below will revert to sync calls. # raw bytes if isinstance(lpath, bytes): - fs = self.get_filesystem_for_path(to_path_file_only, bucket) + fs = self.get_filesystem_for_path(to_path, bucket) with fs.open(to_path, "wb", **kwargs) as s: s.write(lpath) return to_path @@ -528,7 +533,7 @@ async def async_put_raw_data( if isinstance(lpath, io.BufferedReader) or isinstance(lpath, io.BytesIO): if not lpath.readable(): raise FlyteAssertion("Buffered reader must be readable") - fs = self.get_filesystem_for_path(to_path_file_only, bucket) + fs = self.get_filesystem_for_path(to_path, bucket) lpath.seek(0) with fs.open(to_path, "wb", **kwargs) as s: while data := lpath.read(read_chunk_size_bytes): @@ -538,7 +543,7 @@ async def async_put_raw_data( if isinstance(lpath, io.StringIO): if not lpath.readable(): raise FlyteAssertion("Buffered reader must be readable") - fs = self.get_filesystem_for_path(to_path_file_only, bucket) + fs = self.get_filesystem_for_path(to_path, bucket) lpath.seek(0) with fs.open(to_path, "wb", **kwargs) as s: while data_str := lpath.read(read_chunk_size_bytes): From 17bde4a94f69b35b3dac1959357d5194ffa61f8c Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 4 Jan 2025 22:16:44 +0800 Subject: [PATCH 06/42] build(Dockerfile.dev): add obstore Signed-off-by: machichima --- Dockerfile.dev | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.dev b/Dockerfile.dev index 1a839104e4..5d2b622f01 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -40,6 +40,7 @@ RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_FLYTEKIT=$PSEUDO_VERSION \ -e /flytekit \ -e /flytekit/plugins/flytekit-deck-standard \ -e /flytekit/plugins/flytekit-flyteinteractive \ + obstore==0.3.0b9 \ markdown \ pandas \ pillow \ From 353f000d152f964b1f5b6362c6eb628cae433ecf Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 5 Jan 2025 11:56:12 +0800 Subject: [PATCH 07/42] build(pyproject): add obstore Signed-off-by: machichima --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3dc782c507..bf66abce2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "marshmallow-jsonschema>=0.12.0", "mashumaro>=3.15", "msgpack>=1.1.0", + "obstore==0.3.0b9", "protobuf!=4.25.0", "pygments", 
"python-json-logger>=2.0.0", From 7f0782aa130215a17861cda8e67a4c13ea539bef Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 5 Jan 2025 14:59:25 +0800 Subject: [PATCH 08/42] fix: add storage specific obstore class Specify the class properties for each file storage Signed-off-by: machichima --- flytekit/core/data_persistence.py | 8 +++--- flytekit/core/obstore_filesystem.py | 41 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 flytekit/core/obstore_filesystem.py diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index a73fb8bd9d..128fb1c933 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -32,13 +32,13 @@ from decorator import decorator from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol -from obstore.fsspec import AsyncFsspecStore from obstore.store import AzureStore, GCSStore, S3Store from typing_extensions import Unpack from flytekit import configuration from flytekit.configuration import DataConfig from flytekit.core.local_fsspec import FlyteLocalFileSystem +from flytekit.core.obstore_filesystem import ObstoreAzureBlobFileSystem, ObstoreGCSFileSystem, ObstoreS3FileSystem from flytekit.core.utils import timeit from flytekit.exceptions.system import FlyteDownloadDataException, FlyteUploadDataException from flytekit.exceptions.user import FlyteAssertion, FlyteDataNotFoundException @@ -732,9 +732,9 @@ async def async_put_data( put_data = loop_manager.synced(async_put_data) -fsspec.register_implementation("s3", AsyncFsspecStore) -fsspec.register_implementation("gs", AsyncFsspecStore) -fsspec.register_implementation("abfs", AsyncFsspecStore) +fsspec.register_implementation("s3", ObstoreS3FileSystem) +fsspec.register_implementation("gs", ObstoreGCSFileSystem) +fsspec.register_implementation("abfs", ObstoreAzureBlobFileSystem) flyte_tmp_dir = tempfile.mkdtemp(prefix="flyte-") default_local_file_access_provider = FileAccessProvider( diff --git a/flytekit/core/obstore_filesystem.py b/flytekit/core/obstore_filesystem.py new file mode 100644 index 0000000000..047cf22b5d --- /dev/null +++ b/flytekit/core/obstore_filesystem.py @@ -0,0 +1,41 @@ +""" +Classes that overrides the AsyncFsspecStore that specify the filesystem specific parameters +""" + +from obstore.fsspec import AsyncFsspecStore + +DEFAULT_BLOCK_SIZE = 5 * 2**20 + + +class ObstoreS3FileSystem(AsyncFsspecStore): + """ + Add following property used in S3FileSystem + """ + + root_marker = "" + connect_timeout = 5 + retries = 5 + read_timeout = 15 + default_block_size = 5 * 2**20 + protocol = ("s3", "s3a") + _extra_tokenize_attributes = ("default_block_size",) + + +class ObstoreGCSFileSystem(AsyncFsspecStore): + """ + Add following property used in GCSFileSystem + """ + + scopes = {"read_only", "read_write", "full_control"} + retries = 6 # number of retries on http failure + default_block_size = DEFAULT_BLOCK_SIZE + protocol = "gcs", "gs" + async_impl = True + + +class ObstoreAzureBlobFileSystem(AsyncFsspecStore): + """ + Add following property used in AzureBlobFileSystem + """ + + protocol = "abfs" From 04bdf206caa75a152314a1773c02b53fe71b4048 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 5 Jan 2025 15:37:55 +0800 Subject: [PATCH 09/42] fix: path error for some file source not remove protocol from path other than s3, gs, and abfs Signed-off-by: machichima --- flytekit/core/data_persistence.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git 
a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 128fb1c933..89b68aed25 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -116,10 +116,12 @@ def split_path(path: str) -> Tuple[str, str]: >>> split_path("s3://mybucket/path/to/file") ['mybucket', 'path/to/file'] """ - if "file" in path: + support_types = ["s3", "gs", "abfs"] + protocol = get_protocol(path) + if protocol not in support_types: # no bucket for file return "", path - protocol = get_protocol(path) + if path.startswith(protocol + "://"): path = path[len(protocol) + 3 :] elif path.startswith(protocol + "::"): @@ -133,12 +135,8 @@ def split_path(path: str) -> Tuple[str, str]: bucket = path_li[0] # use obstore for s3 and gcs only now, no need to split # bucket out of path for other storage - support_types = ["s3", "gs", "abfs"] - if protocol in support_types: - file_path = "/".join(path_li[1:]) - return (bucket, file_path) - else: - return bucket, path + file_path = "/".join(path_li[1:]) + return (bucket, file_path) def azure_setup_args( From 01894199d64077adc200bf9bde954a3b1e458d34 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 5 Jan 2025 17:50:00 +0800 Subject: [PATCH 10/42] feat: enable setting retries for s3 Signed-off-by: machichima --- flytekit/core/data_persistence.py | 2 ++ flytekit/core/obstore_filesystem.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 89b68aed25..498ecd95cf 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -79,6 +79,8 @@ def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: b }, ) + kwargs["retries"] = s3_cfg.retries + if anonymous: kwargs[_ANON] = True diff --git a/flytekit/core/obstore_filesystem.py b/flytekit/core/obstore_filesystem.py index 047cf22b5d..26ee3dec58 100644 --- a/flytekit/core/obstore_filesystem.py +++ b/flytekit/core/obstore_filesystem.py @@ -2,6 +2,8 @@ Classes that overrides the AsyncFsspecStore that specify the filesystem specific parameters """ +from typing import Optional + from obstore.fsspec import AsyncFsspecStore DEFAULT_BLOCK_SIZE = 5 * 2**20 @@ -20,6 +22,19 @@ class ObstoreS3FileSystem(AsyncFsspecStore): protocol = ("s3", "s3a") _extra_tokenize_attributes = ("default_block_size",) + def __init__(self, retries: Optional[int] = None, **kwargs): + """ + Initialize the ObstoreS3FileSystem with optional retries. 
+ + Args: + retries (int): Number of retry for requests + **kwargs: Other keyword arguments passed to the parent class + """ + if retries is not None: + self.retries = retries + + super().__init__(**kwargs) + class ObstoreGCSFileSystem(AsyncFsspecStore): """ From deb9f3d0a339b8eafb60287046146e8dfb23fa26 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 5 Jan 2025 17:50:40 +0800 Subject: [PATCH 11/42] test: modify test for obstore s3 Signed-off-by: machichima --- tests/flytekit/unit/core/test_flyte_directory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/flytekit/unit/core/test_flyte_directory.py b/tests/flytekit/unit/core/test_flyte_directory.py index be61388fa5..b72ec1aed9 100644 --- a/tests/flytekit/unit/core/test_flyte_directory.py +++ b/tests/flytekit/unit/core/test_flyte_directory.py @@ -318,11 +318,11 @@ def test_directory_guess(): assert fft.extension() == "" -@mock.patch("s3fs.core.S3FileSystem._lsdir") +@mock.patch("flytekit.core.obstore_filesystem.ObstoreS3FileSystem._ls") @mock.patch("flytekit.core.data_persistence.FileAccessProvider.get_data") -def test_list_dir(mock_get_data, mock_lsdir): +def test_list_dir(mock_get_data, mock_ls): remote_dir = "s3://test-flytedir" - mock_lsdir.return_value = [ + mock_ls.return_value = [ {"name": os.path.join(remote_dir, "file1.txt"), "type": "file"}, {"name": os.path.join(remote_dir, "file2.txt"), "type": "file"}, {"name": os.path.join(remote_dir, "subdir"), "type": "directory"}, From 0ca6dbc15af448b191ef7eb97389e92761dafa8b Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 8 Jan 2025 21:52:29 +0800 Subject: [PATCH 12/42] test: use mock patch for obstore test assert if the specific function is called with provided parameters Signed-off-by: machichima --- tests/flytekit/unit/core/test_data.py | 225 ++++++++++++++---- .../unit/core/test_data_persistence.py | 53 +++-- 2 files changed, 215 insertions(+), 63 deletions(-) diff --git a/tests/flytekit/unit/core/test_data.py b/tests/flytekit/unit/core/test_data.py index 42e74f453c..0d16330e3d 100644 --- a/tests/flytekit/unit/core/test_data.py +++ b/tests/flytekit/unit/core/test_data.py @@ -5,14 +5,20 @@ from uuid import UUID import typing import asyncio +from botocore.parsers import base64 import fsspec import mock +from obstore.store import S3Store import pytest from s3fs import S3FileSystem from flytekit.configuration import Config, DataConfig, S3Config from flytekit.core.context_manager import FlyteContextManager, FlyteContext -from flytekit.core.data_persistence import FileAccessProvider, get_fsspec_storage_options, s3_setup_args +from flytekit.core.data_persistence import ( + FileAccessProvider, + get_fsspec_storage_options, + s3_setup_args, +) from flytekit.core.type_engine import TypeEngine from flytekit.types.directory.types import FlyteDirectory from flytekit.types.file import FlyteFile @@ -32,15 +38,21 @@ def test_path_getting(mock_uuid_class, mock_gcs): # Testing with raw output prefix pointing to a local path loc_sandbox = os.path.join(root, "tmp", "unittest") loc_data = os.path.join(root, "tmp", "unittestdata") - local_raw_fp = FileAccessProvider(local_sandbox_dir=loc_sandbox, raw_output_prefix=loc_data) + local_raw_fp = FileAccessProvider( + local_sandbox_dir=loc_sandbox, raw_output_prefix=loc_data + ) r = local_raw_fp.get_random_string() rr = local_raw_fp.join(local_raw_fp.raw_output_prefix, r) assert rr == os.path.join(root, "tmp", "unittestdata", "abcdef123") - rr = local_raw_fp.join(local_raw_fp.raw_output_prefix, r, 
local_raw_fp.get_file_tail("/fsa/blah.csv")) + rr = local_raw_fp.join( + local_raw_fp.raw_output_prefix, r, local_raw_fp.get_file_tail("/fsa/blah.csv") + ) assert rr == os.path.join(root, "tmp", "unittestdata", "abcdef123", "blah.csv") # Test local path and directory - assert local_raw_fp.get_random_local_path() == os.path.join(root, "tmp", "unittest", "local_flytekit", "abcdef123") + assert local_raw_fp.get_random_local_path() == os.path.join( + root, "tmp", "unittest", "local_flytekit", "abcdef123" + ) assert local_raw_fp.get_random_local_path("xjiosa/blah.txt") == os.path.join( root, "tmp", "unittest", "local_flytekit", "abcdef123", "blah.txt" ) @@ -49,20 +61,28 @@ def test_path_getting(mock_uuid_class, mock_gcs): ) # Recursive paths - assert "file:///abc/happy/", "s3://my-s3-bucket/bucket1/" == local_raw_fp.recursive_paths( + assert ( + "file:///abc/happy/" + ), "s3://my-s3-bucket/bucket1/" == local_raw_fp.recursive_paths( "file:///abc/happy/", "s3://my-s3-bucket/bucket1/" ) - assert "file:///abc/happy/", "s3://my-s3-bucket/bucket1/" == local_raw_fp.recursive_paths( + assert ( + "file:///abc/happy/" + ), "s3://my-s3-bucket/bucket1/" == local_raw_fp.recursive_paths( "file:///abc/happy", "s3://my-s3-bucket/bucket1" ) # Test with remote pointed to s3. - s3_fa = FileAccessProvider(local_sandbox_dir=loc_sandbox, raw_output_prefix="s3://my-s3-bucket") + s3_fa = FileAccessProvider( + local_sandbox_dir=loc_sandbox, raw_output_prefix="s3://my-s3-bucket" + ) r = s3_fa.get_random_string() rr = s3_fa.join(s3_fa.raw_output_prefix, r) assert rr == "s3://my-s3-bucket/abcdef123" # trailing slash should make no difference - s3_fa = FileAccessProvider(local_sandbox_dir=loc_sandbox, raw_output_prefix="s3://my-s3-bucket/") + s3_fa = FileAccessProvider( + local_sandbox_dir=loc_sandbox, raw_output_prefix="s3://my-s3-bucket/" + ) r = s3_fa.get_random_string() rr = s3_fa.join(s3_fa.raw_output_prefix, r) assert rr == "s3://my-s3-bucket/abcdef123" @@ -70,17 +90,23 @@ def test_path_getting(mock_uuid_class, mock_gcs): # Testing with raw output prefix pointing to file:// # Skip tests for windows if os.name != "nt": - file_raw_fp = FileAccessProvider(local_sandbox_dir=loc_sandbox, raw_output_prefix="file:///tmp/unittestdata") + file_raw_fp = FileAccessProvider( + local_sandbox_dir=loc_sandbox, raw_output_prefix="file:///tmp/unittestdata" + ) r = file_raw_fp.get_random_string() rr = file_raw_fp.join(file_raw_fp.raw_output_prefix, r) rr = file_raw_fp.strip_file_header(rr) assert rr == os.path.join(root, "tmp", "unittestdata", "abcdef123") r = file_raw_fp.get_random_string() - rr = file_raw_fp.join(file_raw_fp.raw_output_prefix, r, file_raw_fp.get_file_tail("/fsa/blah.csv")) + rr = file_raw_fp.join( + file_raw_fp.raw_output_prefix, r, file_raw_fp.get_file_tail("/fsa/blah.csv") + ) rr = file_raw_fp.strip_file_header(rr) assert rr == os.path.join(root, "tmp", "unittestdata", "abcdef123", "blah.csv") - g_fa = FileAccessProvider(local_sandbox_dir=loc_sandbox, raw_output_prefix="gs://my-s3-bucket/") + g_fa = FileAccessProvider( + local_sandbox_dir=loc_sandbox, raw_output_prefix="gs://my-s3-bucket/" + ) r = g_fa.get_random_string() rr = g_fa.join(g_fa.raw_output_prefix, r) assert rr == "gs://my-s3-bucket/abcdef123" @@ -119,7 +145,11 @@ async def test_local_provider(source_folder): # dest folder exists. 
dc = Config.for_sandbox().data_config with tempfile.TemporaryDirectory() as dest_tmpdir: - provider = FileAccessProvider(local_sandbox_dir="/tmp/unittest", raw_output_prefix=dest_tmpdir, data_config=dc) + provider = FileAccessProvider( + local_sandbox_dir="/tmp/unittest", + raw_output_prefix=dest_tmpdir, + data_config=dc, + ) r = provider.get_random_string() doesnotexist = provider.join(provider.raw_output_prefix, r) await provider.async_put_data(source_folder, doesnotexist, is_multipart=True) @@ -176,9 +206,13 @@ def test_s3_provider(source_folder): # Running mkdir on s3 filesystem doesn't do anything so leaving out for now dc = Config.for_sandbox().data_config provider = FileAccessProvider( - local_sandbox_dir="/tmp/unittest", raw_output_prefix="s3://my-s3-bucket/testdata/", data_config=dc + local_sandbox_dir="/tmp/unittest", + raw_output_prefix="s3://my-s3-bucket/testdata/", + data_config=dc, + ) + doesnotexist = provider.join( + provider.raw_output_prefix, provider.get_random_string() ) - doesnotexist = provider.join(provider.raw_output_prefix, provider.get_random_string()) provider.put_data(source_folder, doesnotexist, is_multipart=True) fs = provider.get_filesystem_for_path(doesnotexist) files = fs.find(doesnotexist) @@ -190,7 +224,9 @@ def test_local_provider_get_empty(): with tempfile.TemporaryDirectory() as empty_source: with tempfile.TemporaryDirectory() as dest_folder: provider = FileAccessProvider( - local_sandbox_dir="/tmp/unittest", raw_output_prefix=empty_source, data_config=dc + local_sandbox_dir="/tmp/unittest", + raw_output_prefix=empty_source, + data_config=dc, ) provider.get_data(empty_source, dest_folder, is_multipart=True) loc = provider.get_filesystem_for_path(dest_folder) @@ -202,18 +238,29 @@ def test_local_provider_get_empty(): @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_s3_setup_args_env_empty(mock_os, mock_get_config_file): +@mock.patch("obstore.store.S3Store.from_env") +def test_s3_setup_args_env_empty(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None mock_os.get.return_value = None s3c = S3Config.auto() kwargs = s3_setup_args(s3c) - assert kwargs == {"cache_regions": True} + + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "aws_allow_http": "true", # Allow HTTP connections + "aws_virtual_hosted_style_request": "false", # Use path-style addressing + }, + ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_s3_setup_args_env_both(mock_os, mock_get_config_file): +@mock.patch("obstore.store.S3Store.from_env") +def test_s3_setup_args_env_both(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None + ee = { "AWS_ACCESS_KEY_ID": "ignore-user", "AWS_SECRET_ACCESS_KEY": "ignore-secret", @@ -222,12 +269,23 @@ def test_s3_setup_args_env_both(mock_os, mock_get_config_file): } mock_os.get.side_effect = lambda x, y: ee.get(x) kwargs = s3_setup_args(S3Config.auto()) - assert kwargs == {"key": "flyte", "secret": "flyte-secret", "cache_regions": True} + + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "access_key_id": "flyte", + "secret_access_key": "flyte-secret", + "aws_allow_http": "true", # Allow HTTP connections + "aws_virtual_hosted_style_request": "false", # Use path-style addressing + }, + ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_s3_setup_args_env_flyte(mock_os, 
mock_get_config_file): +@mock.patch("obstore.store.S3Store.from_env") +def test_s3_setup_args_env_flyte(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "FLYTE_AWS_ACCESS_KEY_ID": "flyte", @@ -235,12 +293,23 @@ def test_s3_setup_args_env_flyte(mock_os, mock_get_config_file): } mock_os.get.side_effect = lambda x, y: ee.get(x) kwargs = s3_setup_args(S3Config.auto()) - assert kwargs == {"key": "flyte", "secret": "flyte-secret", "cache_regions": True} + + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "access_key_id": "flyte", + "secret_access_key": "flyte-secret", + "aws_allow_http": "true", # Allow HTTP connections + "aws_virtual_hosted_style_request": "false", # Use path-style addressing + }, + ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_s3_setup_args_env_aws(mock_os, mock_get_config_file): +@mock.patch("obstore.store.S3Store.from_env") +def test_s3_setup_args_env_aws(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "AWS_ACCESS_KEY_ID": "ignore-user", @@ -248,8 +317,15 @@ def test_s3_setup_args_env_aws(mock_os, mock_get_config_file): } mock_os.get.side_effect = lambda x, y: ee.get(x) kwargs = s3_setup_args(S3Config.auto()) - # not explicitly in kwargs, since fsspec/boto3 will use these env vars by default - assert kwargs == {"cache_regions": True} + + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "aws_allow_http": "true", # Allow HTTP connections + "aws_virtual_hosted_style_request": "false", # Use path-style addressing + }, + ) @mock.patch("flytekit.configuration.get_config_file") @@ -272,31 +348,42 @@ def test_get_fsspec_storage_options_gcs_with_overrides(mock_os, mock_get_config_ "FLYTE_GCP_GSUTIL_PARALLELISM": "False", } mock_os.get.side_effect = lambda x, y: ee.get(x) - storage_options = get_fsspec_storage_options("gs", DataConfig.auto(), anonymous=True, other_argument="value") + storage_options = get_fsspec_storage_options( + "gs", DataConfig.auto(), anonymous=True, other_argument="value" + ) assert storage_options == {"token": "anon", "other_argument": "value"} @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_get_fsspec_storage_options_azure(mock_os, mock_get_config_file): +@mock.patch("obstore.store.AzureStore.from_env") +def test_get_fsspec_storage_options_azure(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None + account_key = "accountkey" + + account_key_base64 = base64.b64encode(account_key.encode()).decode() + ee = { "FLYTE_AZURE_STORAGE_ACCOUNT_NAME": "accountname", - "FLYTE_AZURE_STORAGE_ACCOUNT_KEY": "accountkey", + "FLYTE_AZURE_STORAGE_ACCOUNT_KEY": account_key_base64, "FLYTE_AZURE_TENANT_ID": "tenantid", "FLYTE_AZURE_CLIENT_ID": "clientid", "FLYTE_AZURE_CLIENT_SECRET": "clientsecret", } mock_os.get.side_effect = lambda x, y: ee.get(x) storage_options = get_fsspec_storage_options("abfs", DataConfig.auto()) - assert storage_options == { - "account_name": "accountname", - "account_key": "accountkey", - "client_id": "clientid", - "client_secret": "clientsecret", - "tenant_id": "tenantid", - "anon": False, - } + + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "account_name": "accountname", + "account_key": account_key_base64, + "client_id": "clientid", + "client_secret": "clientsecret", + "tenant_id": "tenantid", + }, 
+ ) @mock.patch("flytekit.configuration.get_config_file") @@ -352,8 +439,14 @@ def test_crawl_local_non_nt(source_folder): res = fd.crawl() split = [(x, y) for x, y in res] files = [os.path.join(x, y) for x, y in split] - assert set(split) == {(source_folder, "original.txt"), (source_folder, os.path.join("nested", "more.txt"))} - expected = {os.path.join(source_folder, "original.txt"), os.path.join(source_folder, "nested", "more.txt")} + assert set(split) == { + (source_folder, "original.txt"), + (source_folder, os.path.join("nested", "more.txt")), + } + expected = { + os.path.join(source_folder, "original.txt"), + os.path.join(source_folder, "nested", "more.txt"), + } assert set(files) == expected # Test crawling a directory without trailing / or \ @@ -379,12 +472,19 @@ def test_crawl_s3(source_folder): # Running mkdir on s3 filesystem doesn't do anything so leaving out for now dc = Config.for_sandbox().data_config provider = FileAccessProvider( - local_sandbox_dir="/tmp/unittest", raw_output_prefix="s3://my-s3-bucket/testdata/", data_config=dc + local_sandbox_dir="/tmp/unittest", + raw_output_prefix="s3://my-s3-bucket/testdata/", + data_config=dc, + ) + s3_random_target = provider.join( + provider.raw_output_prefix, provider.get_random_string() ) - s3_random_target = provider.join(provider.raw_output_prefix, provider.get_random_string()) provider.put_data(source_folder, s3_random_target, is_multipart=True) ctx = FlyteContextManager.current_context() - expected = {f"{s3_random_target}/original.txt", f"{s3_random_target}/nested/more.txt"} + expected = { + f"{s3_random_target}/original.txt", + f"{s3_random_target}/nested/more.txt", + } with FlyteContextManager.with_context(ctx.with_file_access(provider)): fd = FlyteDirectory(path=s3_random_target) @@ -392,7 +492,10 @@ def test_crawl_s3(source_folder): res = [(x, y) for x, y in res] files = [os.path.join(x, y) for x, y in res] assert set(files) == expected - assert set(res) == {(s3_random_target, "original.txt"), (s3_random_target, os.path.join("nested", "more.txt"))} + assert set(res) == { + (s3_random_target, "original.txt"), + (s3_random_target, os.path.join("nested", "more.txt")), + } fd_file = FlyteDirectory(path=f"{s3_random_target}/original.txt") res = fd_file.crawl() @@ -405,7 +508,11 @@ def test_walk_local_copy_to_s3(source_folder): dc = Config.for_sandbox().data_config explicit_empty_folder = UUID(int=random.getrandbits(128)).hex raw_output_path = f"s3://my-s3-bucket/testdata/{explicit_empty_folder}" - provider = FileAccessProvider(local_sandbox_dir="/tmp/unittest", raw_output_prefix=raw_output_path, data_config=dc) + provider = FileAccessProvider( + local_sandbox_dir="/tmp/unittest", + raw_output_prefix=raw_output_path, + data_config=dc, + ) ctx = FlyteContextManager.current_context() local_fd = FlyteDirectory(path=source_folder) @@ -433,7 +540,9 @@ def test_s3_metadata(): dc = Config.for_sandbox().data_config random_folder = UUID(int=random.getrandbits(64)).hex raw_output = f"s3://my-s3-bucket/testing/metadata_test/{random_folder}" - provider = FileAccessProvider(local_sandbox_dir="/tmp/unittest", raw_output_prefix=raw_output, data_config=dc) + provider = FileAccessProvider( + local_sandbox_dir="/tmp/unittest", raw_output_prefix=raw_output, data_config=dc + ) _, local_zip = tempfile.mkstemp(suffix=".gz") with open(local_zip, "w") as f: f.write("hello world") @@ -454,7 +563,9 @@ def test_s3_metadata(): assert len(files) == 2 -async def dummy_output_to_literal_map(ctx: FlyteContext, ff: typing.List[FlyteFile]) -> Literal: +async 
def dummy_output_to_literal_map( + ctx: FlyteContext, ff: typing.List[FlyteFile] +) -> Literal: lt = TypeEngine.to_literal_type(typing.List[FlyteFile]) lit = await TypeEngine.async_to_literal(ctx, ff, typing.List[FlyteFile], lt) return lit @@ -479,7 +590,9 @@ def test_async_local_copy_to_s3(): random_folder = UUID(int=random.getrandbits(64)).hex raw_output = f"s3://my-s3-bucket/testing/upload_test/{random_folder}" print(f"Uploading to {raw_output}") - provider = FileAccessProvider(local_sandbox_dir="/tmp/unittest", raw_output_prefix=raw_output, data_config=dc) + provider = FileAccessProvider( + local_sandbox_dir="/tmp/unittest", raw_output_prefix=raw_output, data_config=dc + ) start_time = datetime.datetime.now(datetime.timezone.utc) start_wall_time = time.perf_counter() @@ -522,10 +635,17 @@ def test_async_download_from_s3(): random_folder = UUID(int=random.getrandbits(64)).hex raw_output = f"s3://my-s3-bucket/testing/upload_test/{random_folder}" print(f"Uploading to {raw_output}") - provider = FileAccessProvider(local_sandbox_dir="/tmp/unittest", raw_output_prefix=raw_output, data_config=dc) + provider = FileAccessProvider( + local_sandbox_dir="/tmp/unittest", raw_output_prefix=raw_output, data_config=dc + ) with FlyteContextManager.with_context(ctx.with_file_access(provider)) as ctx: - lit = TypeEngine.to_literal(ctx, ff, typing.List[FlyteFile], TypeEngine.to_literal_type(typing.List[FlyteFile])) + lit = TypeEngine.to_literal( + ctx, + ff, + typing.List[FlyteFile], + TypeEngine.to_literal_type(typing.List[FlyteFile]), + ) print(f"Literal is {lit}") python_list = TypeEngine.to_python_value(ctx, lit, typing.List[FlyteFile]) @@ -545,10 +665,17 @@ def test_async_download_from_s3(): print(f"Time taken (serial download): {end_time - start_time}") print(f"Wall time taken (serial download): {end_wall_time - start_wall_time}") - print(f"Process time taken (serial download): {end_process_time - start_process_time}") + print( + f"Process time taken (serial download): {end_process_time - start_process_time}" + ) with FlyteContextManager.with_context(ctx.with_file_access(provider)) as ctx: - lit = TypeEngine.to_literal(ctx, ff, typing.List[FlyteFile], TypeEngine.to_literal_type(typing.List[FlyteFile])) + lit = TypeEngine.to_literal( + ctx, + ff, + typing.List[FlyteFile], + TypeEngine.to_literal_type(typing.List[FlyteFile]), + ) print(f"Literal is {lit}") python_list = TypeEngine.to_python_value(ctx, lit, typing.List[FlyteFile]) diff --git a/tests/flytekit/unit/core/test_data_persistence.py b/tests/flytekit/unit/core/test_data_persistence.py index 116717b92d..776c2df01a 100644 --- a/tests/flytekit/unit/core/test_data_persistence.py +++ b/tests/flytekit/unit/core/test_data_persistence.py @@ -10,6 +10,7 @@ import mock import pytest from azure.identity import ClientSecretCredential, DefaultAzureCredential +from botocore.parsers import base64 from flytekit.configuration import Config from flytekit.core.data_persistence import FileAccessProvider @@ -153,18 +154,29 @@ def test_generate_new_custom_path(): assert np == "s3://foo-bucket/my-default-prefix/bar.txt" -def test_initialise_azure_file_provider_with_account_key(): +@mock.patch("obstore.store.AzureStore.from_env") +def test_initialise_azure_file_provider_with_account_key(mock_from_env): + account_key = "accountkey" + account_key_base64 = base64.b64encode(account_key.encode()).decode() + with mock.patch.dict( os.environ, - {"FLYTE_AZURE_STORAGE_ACCOUNT_NAME": "accountname", "FLYTE_AZURE_STORAGE_ACCOUNT_KEY": "accountkey"}, + 
{"FLYTE_AZURE_STORAGE_ACCOUNT_NAME": "accountname", "FLYTE_AZURE_STORAGE_ACCOUNT_KEY": account_key_base64}, ): fp = FileAccessProvider("/tmp", "abfs://container/path/within/container") - assert fp.get_filesystem().account_name == "accountname" - assert fp.get_filesystem().account_key == "accountkey" - assert fp.get_filesystem().sync_credential is None + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "account_name": "accountname", + "account_key": account_key_base64, + }, + ) -def test_initialise_azure_file_provider_with_service_principal(): + +@mock.patch("obstore.store.AzureStore.from_env") +def test_initialise_azure_file_provider_with_service_principal(mock_from_env): with mock.patch.dict( os.environ, { @@ -175,14 +187,21 @@ def test_initialise_azure_file_provider_with_service_principal(): }, ): fp = FileAccessProvider("/tmp", "abfs://container/path/within/container") - assert fp.get_filesystem().account_name == "accountname" - assert isinstance(fp.get_filesystem().sync_credential, ClientSecretCredential) - assert fp.get_filesystem().client_secret == "clientsecret" - assert fp.get_filesystem().client_id == "clientid" - assert fp.get_filesystem().tenant_id == "tenantid" + + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "account_name": "accountname", + "client_secret": "clientsecret", + "client_id": "clientid", + "tenant_id": "tenantid", + }, + ) -def test_initialise_azure_file_provider_with_default_credential(): +@mock.patch("obstore.store.AzureStore.from_env") +def test_initialise_azure_file_provider_with_default_credential(mock_from_env): with mock.patch.dict( os.environ, { @@ -191,8 +210,14 @@ def test_initialise_azure_file_provider_with_default_credential(): }, ): fp = FileAccessProvider("/tmp", "abfs://container/path/within/container") - assert fp.get_filesystem().account_name == "accountname" - assert isinstance(fp.get_filesystem().sync_credential, DefaultAzureCredential) + + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + config={ + "account_name": "accountname", + }, + ) def test_get_file_system(): From 42cc75f7d2cfd3e0b3a10680850ed8fe620134e1 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 8 Jan 2025 23:16:46 +0800 Subject: [PATCH 13/42] fix: use correct anon key for s3 and azure Signed-off-by: machichima --- flytekit/core/data_persistence.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 498ecd95cf..6adc18755e 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -46,11 +46,11 @@ from flytekit.loggers import logger from flytekit.utils.asyn import loop_manager -# Refer to https://github.com/fsspec/s3fs/blob/50bafe4d8766c3b2a4e1fc09669cf02fb2d71454/s3fs/core.py#L198 +# Refer to https://github.com/developmentseed/obstore/blob/33654fc37f19a657689eb93327b621e9f9e01494/obstore/python/obstore/store/_aws.pyi#L11 # for key and secret _FSSPEC_S3_KEY_ID = "access_key_id" _FSSPEC_S3_SECRET = "secret_access_key" -_ANON = "anon" +_ANON = "skip_signature" Uploadable = typing.Union[str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO] @@ -68,7 +68,8 @@ def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: b # S3fs takes this as a special arg if s3_cfg.endpoint is not None: store_kwargs["endpoint_url"] = s3_cfg.endpoint - # kwargs["client_kwargs"] = 
{"endpoint_url": s3_cfg.endpoint} + if anonymous: + store_kwargs[_ANON] = "true" store = S3Store.from_env( bucket, @@ -81,9 +82,6 @@ def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: b kwargs["retries"] = s3_cfg.retries - if anonymous: - kwargs[_ANON] = True - kwargs["store"] = store return kwargs @@ -157,6 +155,8 @@ def azure_setup_args( store_kwargs["client_secret"] = azure_cfg.client_secret if azure_cfg.tenant_id: store_kwargs["tenant_id"] = azure_cfg.tenant_id + if anonymous: + kwargs[_ANON] = "true" store = AzureStore.from_env( container, @@ -167,8 +167,6 @@ def azure_setup_args( kwargs["store"] = store - if anonymous: - kwargs[_ANON] = True return kwargs From a1c99ec54890c788105a92850e4082fccb1bd710 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 8 Jan 2025 23:17:03 +0800 Subject: [PATCH 14/42] fix: use defined DEFAULT_BLOCK_SIZE Signed-off-by: machichima --- flytekit/core/obstore_filesystem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flytekit/core/obstore_filesystem.py b/flytekit/core/obstore_filesystem.py index 26ee3dec58..5d5fb8d77a 100644 --- a/flytekit/core/obstore_filesystem.py +++ b/flytekit/core/obstore_filesystem.py @@ -18,7 +18,7 @@ class ObstoreS3FileSystem(AsyncFsspecStore): connect_timeout = 5 retries = 5 read_timeout = 15 - default_block_size = 5 * 2**20 + default_block_size = DEFAULT_BLOCK_SIZE protocol = ("s3", "s3a") _extra_tokenize_attributes = ("default_block_size",) From 9c7e8dbe4fd85d70930510aad0975cbed8ce84d0 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 10 Jan 2025 22:13:51 +0800 Subject: [PATCH 15/42] build: update obstore to 0.3.0b10 Signed-off-by: machichima --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bf66abce2a..731de3d39d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "marshmallow-jsonschema>=0.12.0", "mashumaro>=3.15", "msgpack>=1.1.0", - "obstore==0.3.0b9", + "obstore==0.3.0b10", "protobuf!=4.25.0", "pygments", "python-json-logger>=2.0.0", From 6fba41b9f41edfbf430a5118d6c86978ea8f1410 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 11 Jan 2025 20:43:35 +0800 Subject: [PATCH 16/42] feat: lru_cache & enable read public bucket Signed-off-by: machichima --- flytekit/core/data_persistence.py | 65 +++++++++++++++++++------------ 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 6adc18755e..e206f37652 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -24,6 +24,7 @@ import pathlib import tempfile import typing +from functools import lru_cache from time import sleep from typing import Any, Dict, Optional, Tuple, Union, cast from uuid import UUID @@ -33,6 +34,7 @@ from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol from obstore.store import AzureStore, GCSStore, S3Store +from obstore.exceptions import GenericError from typing_extensions import Unpack from flytekit import configuration @@ -55,6 +57,34 @@ Uploadable = typing.Union[str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO] +@lru_cache +def s3store_from_env(bucket: str, retries: int, **store_kwargs) -> S3Store: + store = S3Store.from_env( + bucket, + config={ + **store_kwargs, + "aws_allow_http": "true", # Allow HTTP connections + "aws_virtual_hosted_style_request": "false", # Use path-style addressing + }, + ) + return store + + +@lru_cache 
+def gcsstore_from_env(bucket: str) -> GCSStore: + store = GCSStore.from_env(bucket) + return store + + +@lru_cache +def azurestore_from_env(container: str, **store_kwargs) -> AzureStore: + store = AzureStore.from_env( + container, + config=store_kwargs, + ) + return store + + def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: kwargs: Dict[str, Any] = {} store_kwargs: Dict[str, Any] = {} @@ -71,14 +101,7 @@ def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: b if anonymous: store_kwargs[_ANON] = "true" - store = S3Store.from_env( - bucket, - config={ - **store_kwargs, - "aws_allow_http": "true", # Allow HTTP connections - "aws_virtual_hosted_style_request": "false", # Use path-style addressing - }, - ) + store = s3store_from_env(bucket, s3_cfg.retries, **store_kwargs) kwargs["retries"] = s3_cfg.retries @@ -90,12 +113,7 @@ def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: b def gs_setup_args(gcs_cfg: configuration.GCSConfig, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: kwargs: Dict[str, Any] = {} - store = GCSStore.from_env( - bucket, - ) - - if anonymous: - kwargs["token"] = _ANON + store = gcsstore_from_env(bucket) kwargs["store"] = store @@ -158,16 +176,10 @@ def azure_setup_args( if anonymous: kwargs[_ANON] = "true" - store = AzureStore.from_env( - container, - config={ - **store_kwargs, - }, - ) + store = azurestore_from_env(container, **store_kwargs) kwargs["store"] = store - return kwargs @@ -181,8 +193,6 @@ def get_fsspec_storage_options( if protocol == "s3": return {**s3_setup_args(data_config.s3, anonymous=anonymous), **kwargs} if protocol == "gs": - if anonymous: - kwargs["token"] = _ANON return kwargs if protocol in ("abfs", "abfss"): return {**azure_setup_args(data_config.azure, anonymous=anonymous), **kwargs} @@ -405,10 +415,15 @@ async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwa if isinstance(dst, (str, pathlib.Path)): return dst return to_path - except OSError as oe: + except (OSError, GenericError) as oe: logger.debug(f"Error in getting {from_path} to {to_path} rec {recursive} {oe}") if isinstance(file_system, AsyncFileSystem): - exists = await file_system._exists(from_path) # pylint: disable=W0212 + try: + exists = await file_system._exists(from_path) # pylint: disable=W0212 + except GenericError: + # for obstore, as it does not raise FileNotFoundError in fsspec but GenericError + # force it to try get_filesystem(anonymous=True) + exists = True else: exists = file_system.exists(from_path) if not exists: From 99f9ede3b328d37bcdbe19f1d597e4b33df0a01e Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 11 Jan 2025 20:48:44 +0800 Subject: [PATCH 17/42] test: update test Signed-off-by: machichima --- tests/flytekit/unit/core/test_data.py | 63 +++++++++---------- .../unit/core/test_data_persistence.py | 8 +-- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/tests/flytekit/unit/core/test_data.py b/tests/flytekit/unit/core/test_data.py index 0d16330e3d..a76523d5e9 100644 --- a/tests/flytekit/unit/core/test_data.py +++ b/tests/flytekit/unit/core/test_data.py @@ -238,7 +238,7 @@ def test_local_provider_get_empty(): @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("obstore.store.S3Store.from_env") +@mock.patch("flytekit.core.data_persistence.s3store_from_env") def test_s3_setup_args_env_empty(mock_from_env, mock_os, mock_get_config_file): 
mock_get_config_file.return_value = None mock_os.get.return_value = None @@ -246,13 +246,7 @@ def test_s3_setup_args_env_empty(mock_from_env, mock_os, mock_get_config_file): kwargs = s3_setup_args(s3c) mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - config={ - "aws_allow_http": "true", # Allow HTTP connections - "aws_virtual_hosted_style_request": "false", # Use path-style addressing - }, - ) + mock_from_env.assert_called_with("") @mock.patch("flytekit.configuration.get_config_file") @@ -284,7 +278,7 @@ def test_s3_setup_args_env_both(mock_from_env, mock_os, mock_get_config_file): @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("obstore.store.S3Store.from_env") +@mock.patch("flytekit.core.data_persistence.s3store_from_env") def test_s3_setup_args_env_flyte(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { @@ -297,18 +291,14 @@ def test_s3_setup_args_env_flyte(mock_from_env, mock_os, mock_get_config_file): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - config={ - "access_key_id": "flyte", - "secret_access_key": "flyte-secret", - "aws_allow_http": "true", # Allow HTTP connections - "aws_virtual_hosted_style_request": "false", # Use path-style addressing - }, + access_key_id = "flyte", + secret_access_key = "flyte-secret", ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("obstore.store.S3Store.from_env") +@mock.patch("flytekit.core.data_persistence.s3store_from_env") def test_s3_setup_args_env_aws(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { @@ -319,13 +309,7 @@ def test_s3_setup_args_env_aws(mock_from_env, mock_os, mock_get_config_file): kwargs = s3_setup_args(S3Config.auto()) mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - config={ - "aws_allow_http": "true", # Allow HTTP connections - "aws_virtual_hosted_style_request": "false", # Use path-style addressing - }, - ) + mock_from_env.assert_called_with("") @mock.patch("flytekit.configuration.get_config_file") @@ -351,7 +335,7 @@ def test_get_fsspec_storage_options_gcs_with_overrides(mock_os, mock_get_config_ storage_options = get_fsspec_storage_options( "gs", DataConfig.auto(), anonymous=True, other_argument="value" ) - assert storage_options == {"token": "anon", "other_argument": "value"} + assert storage_options == {"other_argument": "value"} @mock.patch("flytekit.configuration.get_config_file") @@ -388,22 +372,35 @@ def test_get_fsspec_storage_options_azure(mock_from_env, mock_os, mock_get_confi @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_get_fsspec_storage_options_azure_with_overrides(mock_os, mock_get_config_file): +@mock.patch("flytekit.core.data_persistence.azurestore_from_env") +def test_get_fsspec_storage_options_azure_with_overrides(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None + + account_key = "accountkey" + account_key_base64 = base64.b64encode(account_key.encode()).decode() + ee = { "FLYTE_AZURE_STORAGE_ACCOUNT_NAME": "accountname", - "FLYTE_AZURE_STORAGE_ACCOUNT_KEY": "accountkey", + "FLYTE_AZURE_STORAGE_ACCOUNT_KEY": account_key_base64, } mock_os.get.side_effect = lambda x, y: ee.get(x) storage_options = get_fsspec_storage_options( - "abfs", DataConfig.auto(), anonymous=True, account_name="other_accountname", other_argument="value" + "abfs", + 
DataConfig.auto(), + anonymous=True, + account_name="other_accountname", + other_argument="value", ) - assert storage_options == { - "account_name": "other_accountname", - "account_key": "accountkey", - "anon": True, - "other_argument": "value", - } + + # TODO: fix the code to support this test case + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", + account_name = "other_accountname", + account_key = account_key_base64, + skip_signature = "true", + ) + def test_crawl_local_nt(source_folder): diff --git a/tests/flytekit/unit/core/test_data_persistence.py b/tests/flytekit/unit/core/test_data_persistence.py index 776c2df01a..288f7aef1d 100644 --- a/tests/flytekit/unit/core/test_data_persistence.py +++ b/tests/flytekit/unit/core/test_data_persistence.py @@ -154,7 +154,7 @@ def test_generate_new_custom_path(): assert np == "s3://foo-bucket/my-default-prefix/bar.txt" -@mock.patch("obstore.store.AzureStore.from_env") +@mock.patch("flytekit.core.data_persistence.azurestore_from_env") def test_initialise_azure_file_provider_with_account_key(mock_from_env): account_key = "accountkey" account_key_base64 = base64.b64encode(account_key.encode()).decode() @@ -168,10 +168,8 @@ def test_initialise_azure_file_provider_with_account_key(mock_from_env): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - config={ - "account_name": "accountname", - "account_key": account_key_base64, - }, + account_name = "accountname", + account_key = account_key_base64, ) From f317ff7e234abfbf6c4cc6257a129f9c2d89d75d Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 12 Jan 2025 23:51:10 +0800 Subject: [PATCH 18/42] feat: override storage options for s3 and azxure Signed-off-by: machichima --- flytekit/core/data_persistence.py | 145 ++++++++++++++++++------------ 1 file changed, 90 insertions(+), 55 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index e206f37652..80f1f92f2b 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -33,16 +33,23 @@ from decorator import decorator from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol -from obstore.store import AzureStore, GCSStore, S3Store from obstore.exceptions import GenericError +from obstore.store import AzureStore, GCSStore, S3Store from typing_extensions import Unpack from flytekit import configuration from flytekit.configuration import DataConfig from flytekit.core.local_fsspec import FlyteLocalFileSystem -from flytekit.core.obstore_filesystem import ObstoreAzureBlobFileSystem, ObstoreGCSFileSystem, ObstoreS3FileSystem +from flytekit.core.obstore_filesystem import ( + ObstoreAzureBlobFileSystem, + ObstoreGCSFileSystem, + ObstoreS3FileSystem, +) from flytekit.core.utils import timeit -from flytekit.exceptions.system import FlyteDownloadDataException, FlyteUploadDataException +from flytekit.exceptions.system import ( + FlyteDownloadDataException, + FlyteUploadDataException, +) from flytekit.exceptions.user import FlyteAssertion, FlyteDataNotFoundException from flytekit.interfaces.random import random from flytekit.loggers import logger @@ -80,30 +87,30 @@ def gcsstore_from_env(bucket: str) -> GCSStore: def azurestore_from_env(container: str, **store_kwargs) -> AzureStore: store = AzureStore.from_env( container, - config=store_kwargs, + config=store_kwargs, # type: ignore ) return store -def s3_setup_args(s3_cfg: configuration.S3Config, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: - 
kwargs: Dict[str, Any] = {} +def s3_setup_args( + s3_cfg: configuration.S3Config, bucket: str = "", anonymous: bool = False, **kwargs +) -> Dict[str, Any]: store_kwargs: Dict[str, Any] = {} - if s3_cfg.access_key_id: - store_kwargs[_FSSPEC_S3_KEY_ID] = s3_cfg.access_key_id - - if s3_cfg.secret_access_key: - store_kwargs[_FSSPEC_S3_SECRET] = s3_cfg.secret_access_key - - # S3fs takes this as a special arg - if s3_cfg.endpoint is not None: - store_kwargs["endpoint_url"] = s3_cfg.endpoint + if _FSSPEC_S3_KEY_ID in kwargs or s3_cfg.access_key_id is not None: + store_kwargs[_FSSPEC_S3_KEY_ID] = kwargs.get(_FSSPEC_S3_KEY_ID, s3_cfg.access_key_id) + if _FSSPEC_S3_SECRET in kwargs or s3_cfg.secret_access_key is not None: + store_kwargs[_FSSPEC_S3_SECRET] = kwargs.get(_FSSPEC_S3_SECRET, s3_cfg.secret_access_key) + if "endpoint_url" in kwargs or s3_cfg.endpoint is not None: + store_kwargs["endpoint_url"] = kwargs.get("endpoint_url", s3_cfg.endpoint) + if "retries" in kwargs or s3_cfg.retries is not None: + retries = kwargs.get("retries", s3_cfg.retries) + if "backoff" in kwargs or s3_cfg.backoff is not None: + backoff = kwargs.get("backoff", s3_cfg.backoff) if anonymous: store_kwargs[_ANON] = "true" - store = s3store_from_env(bucket, s3_cfg.retries, **store_kwargs) - - kwargs["retries"] = s3_cfg.retries + store = s3store_from_env(bucket, retries, backoff, **store_kwargs) kwargs["store"] = store @@ -120,6 +127,34 @@ def gs_setup_args(gcs_cfg: configuration.GCSConfig, bucket: str = "", anonymous: return kwargs +def azure_setup_args( + azure_cfg: configuration.AzureBlobStorageConfig, + container: str = "", + anonymous: bool = False, + **kwargs, +) -> Dict[str, Any]: + store_kwargs: Dict[str, Any] = {} + + if "account_name" in kwargs or azure_cfg.account_name is not None: + store_kwargs["account_name"] = kwargs.get("account_name", azure_cfg.account_name) + if "account_key" in kwargs or azure_cfg.account_key is not None: + store_kwargs["account_key"] = kwargs.get("account_key", azure_cfg.account_key) + if "client_id" in kwargs or azure_cfg.client_id is not None: + store_kwargs["client_id"] = kwargs.get("client_id", azure_cfg.client_id) + if "client_secret" in kwargs or azure_cfg.client_secret is not None: + store_kwargs["client_secret"] = kwargs.get("client_secret", azure_cfg.client_secret) + if "tenant_id" in kwargs or azure_cfg.tenant_id is not None: + store_kwargs["tenant_id"] = kwargs.get("tenant_id", azure_cfg.tenant_id) + if anonymous: + store_kwargs[_ANON] = "true" + + store = azurestore_from_env(container, **store_kwargs) + + kwargs["store"] = store + + return kwargs + + def split_path(path: str) -> Tuple[str, str]: """ Split bucket and file path @@ -157,45 +192,28 @@ def split_path(path: str) -> Tuple[str, str]: return (bucket, file_path) -def azure_setup_args( - azure_cfg: configuration.AzureBlobStorageConfig, container: str = "", anonymous: bool = False -) -> Dict[str, Any]: - kwargs: Dict[str, Any] = {} - store_kwargs: Dict[str, Any] = {} - - if azure_cfg.account_name: - store_kwargs["account_name"] = azure_cfg.account_name - if azure_cfg.account_key: - store_kwargs["account_key"] = azure_cfg.account_key - if azure_cfg.client_id: - store_kwargs["client_id"] = azure_cfg.client_id - if azure_cfg.client_secret: - store_kwargs["client_secret"] = azure_cfg.client_secret - if azure_cfg.tenant_id: - store_kwargs["tenant_id"] = azure_cfg.tenant_id - if anonymous: - kwargs[_ANON] = "true" - - store = azurestore_from_env(container, **store_kwargs) - - kwargs["store"] = store - - return kwargs - - def 
get_fsspec_storage_options( - protocol: str, data_config: typing.Optional[DataConfig] = None, anonymous: bool = False, **kwargs + protocol: str, + data_config: typing.Optional[DataConfig] = None, + anonymous: bool = False, + **kwargs, ) -> Dict[str, Any]: data_config = data_config or DataConfig.auto() if protocol == "file": return {"auto_mkdir": True, **kwargs} if protocol == "s3": - return {**s3_setup_args(data_config.s3, anonymous=anonymous), **kwargs} + return { + **s3_setup_args(data_config.s3, anonymous=anonymous, **kwargs), + **kwargs, + } if protocol == "gs": return kwargs if protocol in ("abfs", "abfss"): - return {**azure_setup_args(data_config.azure, anonymous=anonymous), **kwargs} + return { + **azure_setup_args(data_config.azure, anonymous=anonymous, **kwargs), + **kwargs, + } return {} @@ -290,7 +308,7 @@ def get_filesystem( kwargs["auto_mkdir"] = True return FlyteLocalFileSystem(**kwargs) elif protocol == "s3": - s3kwargs = s3_setup_args(self._data_config.s3, bucket, anonymous=anonymous) + s3kwargs = s3_setup_args(self._data_config.s3, bucket, anonymous=anonymous, **kwargs) s3kwargs.update(kwargs) return fsspec.filesystem(protocol, **s3kwargs) # type: ignore elif protocol == "gs": @@ -298,7 +316,7 @@ def get_filesystem( gskwargs.update(kwargs) return fsspec.filesystem(protocol, **gskwargs) # type: ignore elif protocol == "abfs": - azkwargs = azure_setup_args(self._data_config.azure, bucket, anonymous=anonymous) + azkwargs = azure_setup_args(self._data_config.azure, bucket, anonymous=anonymous, **kwargs) azkwargs.update(kwargs) return fsspec.filesystem(protocol, **azkwargs) # type: ignore elif protocol == "ftp": @@ -306,7 +324,10 @@ def get_filesystem( return fsspec.filesystem(protocol, **kwargs) storage_options = get_fsspec_storage_options( - protocol=protocol, anonymous=anonymous, data_config=self._data_config, **kwargs + protocol=protocol, + anonymous=anonymous, + data_config=self._data_config, + **kwargs, ) kwargs.update(storage_options) @@ -319,7 +340,13 @@ async def get_async_filesystem_for_path( loop = asyncio.get_running_loop() return self.get_filesystem( - protocol, anonymous=anonymous, path=path, bucket=bucket, asynchronous=True, loop=loop, **kwargs + protocol, + anonymous=anonymous, + path=path, + bucket=bucket, + asynchronous=True, + loop=loop, + **kwargs, ) def get_filesystem_for_path( @@ -405,7 +432,9 @@ async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwa import shutil return shutil.copytree( - self.strip_file_header(from_path), self.strip_file_header(to_path), dirs_exist_ok=True + self.strip_file_header(from_path), + self.strip_file_header(to_path), + dirs_exist_ok=True, ) logger.info(f"Getting {from_path} to {to_path}") if isinstance(file_system, AsyncFileSystem): @@ -421,7 +450,7 @@ async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwa try: exists = await file_system._exists(from_path) # pylint: disable=W0212 except GenericError: - # for obstore, as it does not raise FileNotFoundError in fsspec but GenericError + # for obstore, as it does not raise FileNotFoundError in fsspec but GenericError # force it to try get_filesystem(anonymous=True) exists = True else: @@ -454,7 +483,9 @@ async def _put(self, from_path: str, to_path: str, recursive: bool = False, **kw import shutil return shutil.copytree( - self.strip_file_header(from_path), self.strip_file_header(to_path), dirs_exist_ok=True + self.strip_file_header(from_path), + self.strip_file_header(to_path), + dirs_exist_ok=True, ) from_path, to_path = 
self.recursive_paths(from_path, to_path) if self._execution_metadata: @@ -714,7 +745,11 @@ async def async_get_data(self, remote_path: str, local_path: str, is_multipart: get_data = loop_manager.synced(async_get_data) async def async_put_data( - self, local_path: Union[str, os.PathLike], remote_path: str, is_multipart: bool = False, **kwargs + self, + local_path: Union[str, os.PathLike], + remote_path: str, + is_multipart: bool = False, + **kwargs, ) -> str: """ The implication here is that we're always going to put data to the remote location, so we .remote to ensure From 749a7fe3b390f944bc7b07c42da1857febea7ff1 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 13 Jan 2025 00:00:30 +0800 Subject: [PATCH 19/42] test: update test Signed-off-by: machichima --- tests/flytekit/unit/core/test_data.py | 29 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/flytekit/unit/core/test_data.py b/tests/flytekit/unit/core/test_data.py index a76523d5e9..b61688b8ff 100644 --- a/tests/flytekit/unit/core/test_data.py +++ b/tests/flytekit/unit/core/test_data.py @@ -1,3 +1,4 @@ +from datetime import timedelta import os import random import shutil @@ -246,12 +247,16 @@ def test_s3_setup_args_env_empty(mock_from_env, mock_os, mock_get_config_file): kwargs = s3_setup_args(s3c) mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with("") + mock_from_env.assert_called_with( + "", + 3, # retries + timedelta(seconds=5), # backoff + ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("obstore.store.S3Store.from_env") +@mock.patch("flytekit.core.data_persistence.s3store_from_env") def test_s3_setup_args_env_both(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None @@ -267,12 +272,10 @@ def test_s3_setup_args_env_both(mock_from_env, mock_os, mock_get_config_file): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - config={ - "access_key_id": "flyte", - "secret_access_key": "flyte-secret", - "aws_allow_http": "true", # Allow HTTP connections - "aws_virtual_hosted_style_request": "false", # Use path-style addressing - }, + 3, # retries + timedelta(seconds=5), # backoff + access_key_id = "flyte", + secret_access_key = "flyte-secret", ) @@ -291,6 +294,8 @@ def test_s3_setup_args_env_flyte(mock_from_env, mock_os, mock_get_config_file): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", + 3, # retries + timedelta(seconds=5), # backoff access_key_id = "flyte", secret_access_key = "flyte-secret", ) @@ -309,7 +314,12 @@ def test_s3_setup_args_env_aws(mock_from_env, mock_os, mock_get_config_file): kwargs = s3_setup_args(S3Config.auto()) mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with("") + mock_from_env.assert_called_with( + "", + 3, # retries + timedelta(seconds=5), # backoff + + ) @mock.patch("flytekit.configuration.get_config_file") @@ -392,7 +402,6 @@ def test_get_fsspec_storage_options_azure_with_overrides(mock_from_env, mock_os, other_argument="value", ) - # TODO: fix the code to support this test case mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", From 31d88806bbbfbee2349b20f084f2cd8087332522 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 15 Jan 2025 22:31:05 +0800 Subject: [PATCH 20/42] fix: update storage config Signed-off-by: machichima --- flytekit/core/data_persistence.py | 13 ++++++++++++- flytekit/core/obstore_filesystem.py | 24 
++---------------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 80f1f92f2b..4717a0244f 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -24,6 +24,7 @@ import pathlib import tempfile import typing +from datetime import timedelta from functools import lru_cache from time import sleep from typing import Any, Dict, Optional, Tuple, Union, cast @@ -65,7 +66,7 @@ @lru_cache -def s3store_from_env(bucket: str, retries: int, **store_kwargs) -> S3Store: +def s3store_from_env(bucket: str, retries: int, backoff: timedelta, **store_kwargs) -> S3Store: store = S3Store.from_env( bucket, config={ @@ -73,6 +74,16 @@ def s3store_from_env(bucket: str, retries: int, **store_kwargs) -> S3Store: "aws_allow_http": "true", # Allow HTTP connections "aws_virtual_hosted_style_request": "false", # Use path-style addressing }, + client_options={"timeout": "999999s"}, # need to put this to somewhere for user to config? + retry_config={ + "max_retries": retries, + "backoff": { + "base": 2, + "init_backoff": backoff, + "max_backoff": timedelta(seconds=16), + }, + "retry_timeout": timedelta(minutes=3), + }, ) return store diff --git a/flytekit/core/obstore_filesystem.py b/flytekit/core/obstore_filesystem.py index 5d5fb8d77a..b82e8a6d1c 100644 --- a/flytekit/core/obstore_filesystem.py +++ b/flytekit/core/obstore_filesystem.py @@ -2,8 +2,6 @@ Classes that overrides the AsyncFsspecStore that specify the filesystem specific parameters """ -from typing import Optional - from obstore.fsspec import AsyncFsspecStore DEFAULT_BLOCK_SIZE = 5 * 2**20 @@ -15,26 +13,10 @@ class ObstoreS3FileSystem(AsyncFsspecStore): """ root_marker = "" - connect_timeout = 5 - retries = 5 - read_timeout = 15 - default_block_size = DEFAULT_BLOCK_SIZE + blocksize = DEFAULT_BLOCK_SIZE protocol = ("s3", "s3a") _extra_tokenize_attributes = ("default_block_size",) - def __init__(self, retries: Optional[int] = None, **kwargs): - """ - Initialize the ObstoreS3FileSystem with optional retries. 
- - Args: - retries (int): Number of retry for requests - **kwargs: Other keyword arguments passed to the parent class - """ - if retries is not None: - self.retries = retries - - super().__init__(**kwargs) - class ObstoreGCSFileSystem(AsyncFsspecStore): """ @@ -42,10 +24,8 @@ class ObstoreGCSFileSystem(AsyncFsspecStore): """ scopes = {"read_only", "read_write", "full_control"} - retries = 6 # number of retries on http failure - default_block_size = DEFAULT_BLOCK_SIZE + blocksize = DEFAULT_BLOCK_SIZE protocol = "gcs", "gs" - async_impl = True class ObstoreAzureBlobFileSystem(AsyncFsspecStore): From 9645dc7fb07d8d40b8f3eec3a3a37a61857aef62 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 23 Jan 2025 23:01:12 +0800 Subject: [PATCH 21/42] fix: update obstore client_options Signed-off-by: machichima --- flytekit/core/data_persistence.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 4717a0244f..19d08d19f9 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -71,10 +71,9 @@ def s3store_from_env(bucket: str, retries: int, backoff: timedelta, **store_kwar bucket, config={ **store_kwargs, - "aws_allow_http": "true", # Allow HTTP connections "aws_virtual_hosted_style_request": "false", # Use path-style addressing }, - client_options={"timeout": "999999s"}, # need to put this to somewhere for user to config? + client_options={"timeout": "99999s", "allow_http": "true"}, retry_config={ "max_retries": retries, "backoff": { From 5ac0766c264df39f0c0f6ee778318090b35ad768 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 28 Jan 2025 23:08:18 +0800 Subject: [PATCH 22/42] refactor: comments + config format Signed-off-by: machichima --- flytekit/core/data_persistence.py | 35 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 19d08d19f9..ca4282f837 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -67,13 +67,12 @@ @lru_cache def s3store_from_env(bucket: str, retries: int, backoff: timedelta, **store_kwargs) -> S3Store: + # Config refer to https://developmentseed.org/obstore/latest/api/store/aws/#obstore.store.S3Store.from_env store = S3Store.from_env( bucket, - config={ - **store_kwargs, - "aws_virtual_hosted_style_request": "false", # Use path-style addressing - }, - client_options={"timeout": "99999s", "allow_http": "true"}, + **store_kwargs, + virtual_hosted_style_request=False, # not include bucket name in the domain name + client_options={"timeout": "99999s", "allow_http": True}, retry_config={ "max_retries": retries, "backoff": { @@ -84,6 +83,7 @@ def s3store_from_env(bucket: str, retries: int, backoff: timedelta, **store_kwar "retry_timeout": timedelta(minutes=3), }, ) + return store @@ -95,16 +95,17 @@ def gcsstore_from_env(bucket: str) -> GCSStore: @lru_cache def azurestore_from_env(container: str, **store_kwargs) -> AzureStore: - store = AzureStore.from_env( - container, - config=store_kwargs, # type: ignore - ) + # Config refer to https://developmentseed.org/obstore/latest/api/store/azure/#obstore.store.AzureStore.from_env + store = AzureStore.from_env(container, **store_kwargs) return store def s3_setup_args( s3_cfg: configuration.S3Config, bucket: str = "", anonymous: bool = False, **kwargs ) -> Dict[str, Any]: + """ + Setup s3 storage, bucket is needed to create obstore store object + """ 
store_kwargs: Dict[str, Any] = {} if _FSSPEC_S3_KEY_ID in kwargs or s3_cfg.access_key_id is not None: @@ -114,13 +115,13 @@ def s3_setup_args( if "endpoint_url" in kwargs or s3_cfg.endpoint is not None: store_kwargs["endpoint_url"] = kwargs.get("endpoint_url", s3_cfg.endpoint) if "retries" in kwargs or s3_cfg.retries is not None: - retries = kwargs.get("retries", s3_cfg.retries) + store_kwargs["retries"] = kwargs.get("retries", s3_cfg.retries) if "backoff" in kwargs or s3_cfg.backoff is not None: - backoff = kwargs.get("backoff", s3_cfg.backoff) + store_kwargs["backoff"] = kwargs.get("backoff", s3_cfg.backoff) if anonymous: store_kwargs[_ANON] = "true" - store = s3store_from_env(bucket, retries, backoff, **store_kwargs) + store = s3store_from_env(bucket, **store_kwargs) kwargs["store"] = store @@ -128,6 +129,9 @@ def s3_setup_args( def gs_setup_args(gcs_cfg: configuration.GCSConfig, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: + """ + Setup gcs storage, bucket is needed to create obstore store object + """ kwargs: Dict[str, Any] = {} store = gcsstore_from_env(bucket) @@ -139,10 +143,13 @@ def gs_setup_args(gcs_cfg: configuration.GCSConfig, bucket: str = "", anonymous: def azure_setup_args( azure_cfg: configuration.AzureBlobStorageConfig, - container: str = "", + bucket: str = "", anonymous: bool = False, **kwargs, ) -> Dict[str, Any]: + """ + Setup azure blob storage, bucket is needed to create obstore store object + """ store_kwargs: Dict[str, Any] = {} if "account_name" in kwargs or azure_cfg.account_name is not None: @@ -158,7 +165,7 @@ def azure_setup_args( if anonymous: store_kwargs[_ANON] = "true" - store = azurestore_from_env(container, **store_kwargs) + store = azurestore_from_env(bucket, **store_kwargs) kwargs["store"] = store From 8a0f2626dbf811477b73b226a3a8a15f47762f8b Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 28 Jan 2025 23:09:04 +0800 Subject: [PATCH 23/42] fix: add bucket to get_fsspec_storage_options Signed-off-by: machichima --- flytekit/core/data_persistence.py | 8 +++++--- flytekit/types/structured/basic_dfs.py | 10 ++++++++-- .../flytekitplugins/polars/sd_transformers.py | 8 +++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index ca4282f837..ca64ed6ac8 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -213,6 +213,7 @@ def get_fsspec_storage_options( protocol: str, data_config: typing.Optional[DataConfig] = None, anonymous: bool = False, + bucket: str = "", **kwargs, ) -> Dict[str, Any]: data_config = data_config or DataConfig.auto() @@ -221,14 +222,14 @@ def get_fsspec_storage_options( return {"auto_mkdir": True, **kwargs} if protocol == "s3": return { - **s3_setup_args(data_config.s3, anonymous=anonymous, **kwargs), + **s3_setup_args(data_config.s3, bucket, anonymous=anonymous, **kwargs), **kwargs, } if protocol == "gs": - return kwargs + return {**gs_setup_args(data_config.gcs, bucket, anonymous=anonymous), **kwargs} if protocol in ("abfs", "abfss"): return { - **azure_setup_args(data_config.azure, anonymous=anonymous, **kwargs), + **azure_setup_args(data_config.azure, bucket, anonymous=anonymous, **kwargs), **kwargs, } return {} @@ -344,6 +345,7 @@ def get_filesystem( protocol=protocol, anonymous=anonymous, data_config=self._data_config, + bucket=bucket, **kwargs, ) kwargs.update(storage_options) diff --git a/flytekit/types/structured/basic_dfs.py b/flytekit/types/structured/basic_dfs.py index 
f9f5d536a6..0a44cead59 100644 --- a/flytekit/types/structured/basic_dfs.py +++ b/flytekit/types/structured/basic_dfs.py @@ -9,7 +9,7 @@ from flytekit import FlyteContext, lazy_module, logger from flytekit.configuration import DataConfig -from flytekit.core.data_persistence import get_fsspec_storage_options +from flytekit.core.data_persistence import get_fsspec_storage_options, split_path from flytekit.models import literals from flytekit.models.literals import StructuredDatasetMetadata from flytekit.models.types import StructuredDatasetType @@ -37,7 +37,13 @@ def get_pandas_storage_options( from pandas.io.common import is_fsspec_url if is_fsspec_url(uri): - return get_fsspec_storage_options(protocol=get_protocol(uri), data_config=data_config, anonymous=anonymous) + bucket, _ = split_path(uri) + return get_fsspec_storage_options( + protocol=get_protocol(uri), + data_config=data_config, + anonymous=anonymous, + bucket=bucket, + ) # Pandas does not allow storage_options for non-fsspec paths e.g. local. return None diff --git a/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py b/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py index e6359641ca..e749eec660 100644 --- a/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py +++ b/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py @@ -2,7 +2,7 @@ import typing from flytekit import FlyteContext, lazy_module -from flytekit.core.data_persistence import get_fsspec_storage_options +from flytekit.core.data_persistence import get_fsspec_storage_options, split_path from flytekit.models import literals from flytekit.models.literals import StructuredDatasetMetadata from flytekit.models.types import StructuredDatasetType @@ -91,10 +91,11 @@ def decode( current_task_metadata: StructuredDatasetMetadata, ) -> pl.DataFrame: uri = flyte_value.uri - + bucket, _ = split_path(uri) kwargs = get_fsspec_storage_options( protocol=fsspec_utils.get_protocol(uri), data_config=ctx.file_access.data_config, + bucket=bucket, ) if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns: columns = [c.name for c in current_task_metadata.structured_dataset_type.columns] @@ -153,10 +154,11 @@ def decode( current_task_metadata: StructuredDatasetMetadata, ) -> pl.LazyFrame: uri = flyte_value.uri - + bucket, _ = split_path(uri) kwargs = get_fsspec_storage_options( protocol=fsspec_utils.get_protocol(uri), data_config=ctx.file_access.data_config, + bucket=bucket, ) # use read_parquet instead of scan_parquet for now because scan_parquet currently doesn't work with fsspec: # https://github.com/pola-rs/polars/issues/16737 From 4009f80498d4802180a041f54fa8ee0e38c299a7 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 28 Jan 2025 23:09:13 +0800 Subject: [PATCH 24/42] test: update test Signed-off-by: machichima --- tests/flytekit/unit/core/test_data.py | 48 +++++++++++-------- .../unit/core/test_data_persistence.py | 14 ++---- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/tests/flytekit/unit/core/test_data.py b/tests/flytekit/unit/core/test_data.py index b61688b8ff..6cb55a6675 100644 --- a/tests/flytekit/unit/core/test_data.py +++ b/tests/flytekit/unit/core/test_data.py @@ -249,8 +249,8 @@ def test_s3_setup_args_env_empty(mock_from_env, mock_os, mock_get_config_file): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - 3, # retries - timedelta(seconds=5), # backoff + retries=3, + backoff=timedelta(seconds=5), ) @@ -272,8 +272,8 @@ def 
test_s3_setup_args_env_both(mock_from_env, mock_os, mock_get_config_file): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - 3, # retries - timedelta(seconds=5), # backoff + retries=3, + backoff=timedelta(seconds=5), access_key_id = "flyte", secret_access_key = "flyte-secret", ) @@ -294,8 +294,8 @@ def test_s3_setup_args_env_flyte(mock_from_env, mock_os, mock_get_config_file): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - 3, # retries - timedelta(seconds=5), # backoff + retries=3, + backoff=timedelta(seconds=5), access_key_id = "flyte", secret_access_key = "flyte-secret", ) @@ -316,27 +316,31 @@ def test_s3_setup_args_env_aws(mock_from_env, mock_os, mock_get_config_file): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - 3, # retries - timedelta(seconds=5), # backoff - + retries=3, + backoff=timedelta(seconds=5), ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_get_fsspec_storage_options_gcs(mock_os, mock_get_config_file): +@mock.patch("flytekit.core.data_persistence.gcsstore_from_env") +def test_get_fsspec_storage_options_gcs(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "FLYTE_GCP_GSUTIL_PARALLELISM": "False", } mock_os.get.side_effect = lambda x, y: ee.get(x) storage_options = get_fsspec_storage_options("gs", DataConfig.auto()) - assert storage_options == {} + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", # bucket name is empty + ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -def test_get_fsspec_storage_options_gcs_with_overrides(mock_os, mock_get_config_file): +@mock.patch("flytekit.core.data_persistence.gcsstore_from_env") +def test_get_fsspec_storage_options_gcs_with_overrides(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "FLYTE_GCP_GSUTIL_PARALLELISM": "False", @@ -345,12 +349,16 @@ def test_get_fsspec_storage_options_gcs_with_overrides(mock_os, mock_get_config_ storage_options = get_fsspec_storage_options( "gs", DataConfig.auto(), anonymous=True, other_argument="value" ) - assert storage_options == {"other_argument": "value"} + assert "other_argument" in storage_options + mock_from_env.return_value = mock.Mock() + mock_from_env.assert_called_with( + "", # bucket name is empty + ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("obstore.store.AzureStore.from_env") +@mock.patch("flytekit.core.data_persistence.azurestore_from_env") def test_get_fsspec_storage_options_azure(mock_from_env, mock_os, mock_get_config_file): mock_get_config_file.return_value = None account_key = "accountkey" @@ -370,13 +378,11 @@ def test_get_fsspec_storage_options_azure(mock_from_env, mock_os, mock_get_confi mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - config={ - "account_name": "accountname", - "account_key": account_key_base64, - "client_id": "clientid", - "client_secret": "clientsecret", - "tenant_id": "tenantid", - }, + account_name = "accountname", + account_key = account_key_base64, + client_id = "clientid", + client_secret = "clientsecret", + tenant_id = "tenantid", ) diff --git a/tests/flytekit/unit/core/test_data_persistence.py b/tests/flytekit/unit/core/test_data_persistence.py index 288f7aef1d..b1d6dc9f15 100644 --- a/tests/flytekit/unit/core/test_data_persistence.py +++ 
b/tests/flytekit/unit/core/test_data_persistence.py @@ -189,12 +189,10 @@ def test_initialise_azure_file_provider_with_service_principal(mock_from_env): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - config={ - "account_name": "accountname", - "client_secret": "clientsecret", - "client_id": "clientid", - "tenant_id": "tenantid", - }, + account_name = "accountname", + client_secret = "clientsecret", + client_id = "clientid", + tenant_id = "tenantid", ) @@ -212,9 +210,7 @@ def test_initialise_azure_file_provider_with_default_credential(mock_from_env): mock_from_env.return_value = mock.Mock() mock_from_env.assert_called_with( "", - config={ - "account_name": "accountname", - }, + account_name = "accountname", ) From 222df10e95d3ed53e0a3eaef8f6a77cd0a81bcd6 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 4 Mar 2025 22:06:49 +0800 Subject: [PATCH 25/42] feat: adapt usage of newest obstore version Signed-off-by: machichima --- flytekit/core/data_persistence.py | 315 +++++++++--------- flytekit/core/obstore_filesystem.py | 36 -- flytekit/types/structured/basic_dfs.py | 4 +- .../flytekitplugins/polars/sd_transformers.py | 6 +- 4 files changed, 152 insertions(+), 209 deletions(-) delete mode 100644 flytekit/core/obstore_filesystem.py diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 033b124a32..5e80d3e364 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -36,16 +36,12 @@ from fsspec.utils import get_protocol from obstore.exceptions import GenericError from obstore.store import AzureStore, GCSStore, S3Store +from obstore.fsspec import register from typing_extensions import Unpack from flytekit import configuration from flytekit.configuration import DataConfig from flytekit.core.local_fsspec import FlyteLocalFileSystem -from flytekit.core.obstore_filesystem import ( - ObstoreAzureBlobFileSystem, - ObstoreGCSFileSystem, - ObstoreS3FileSystem, -) from flytekit.core.utils import timeit from flytekit.exceptions.system import ( FlyteDownloadDataException, @@ -62,22 +58,38 @@ _FSSPEC_S3_SECRET = "secret_access_key" _ANON = "skip_signature" -Uploadable = typing.Union[str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO] +Uploadable = typing.Union[ + str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO +] # This is the default chunk size flytekit will use for writing to S3 and GCS. This is set to 25MB by default and is # configurable by the user if needed. This is used when put() is called on filesystems. 
-_WRITE_SIZE_CHUNK_BYTES = int(os.environ.get("_F_P_WRITE_CHUNK_SIZE", "26214400")) # 25 * 2**20 - - -@lru_cache -def s3store_from_env(bucket: str, retries: int, backoff: timedelta, **store_kwargs) -> S3Store: - # Config refer to https://developmentseed.org/obstore/latest/api/store/aws/#obstore.store.S3Store.from_env - store = S3Store.from_env( - bucket, - **store_kwargs, - virtual_hosted_style_request=False, # not include bucket name in the domain name - client_options={"timeout": "99999s", "allow_http": True}, - retry_config={ +_WRITE_SIZE_CHUNK_BYTES = int( + os.environ.get("_F_P_WRITE_CHUNK_SIZE", "26214400") +) # 25 * 2**20 + + +def s3_setup_args( + s3_cfg: configuration.S3Config, anonymous: bool = False, **kwargs +) -> Dict[str, Any]: + """ + Setup s3 storage, bucket is needed to create obstore store object + """ + + config: Dict[str, Any] = {} + + config[_FSSPEC_S3_KEY_ID] = kwargs.pop(_FSSPEC_S3_KEY_ID, s3_cfg.access_key_id) + config[_FSSPEC_S3_SECRET] = kwargs.pop(_FSSPEC_S3_SECRET, s3_cfg.secret_access_key) + config["endpoint_url"] = kwargs.pop("endpoint_url", s3_cfg.endpoint) + + retries = kwargs.pop("retries", s3_cfg.retries) + backoff = kwargs.pop("backoff", s3_cfg.backoff) + + if anonymous: + kwargs[_ANON] = "true" + + retry_config = ( + { "max_retries": retries, "backoff": { "base": 2, @@ -88,136 +100,58 @@ def s3store_from_env(bucket: str, retries: int, backoff: timedelta, **store_kwar }, ) - return store - + client_options = {"timeout": "99999s", "allow_http": "true"} -@lru_cache -def gcsstore_from_env(bucket: str) -> GCSStore: - store = GCSStore.from_env(bucket) - return store - - -@lru_cache -def azurestore_from_env(container: str, **store_kwargs) -> AzureStore: - # Config refer to https://developmentseed.org/obstore/latest/api/store/azure/#obstore.store.AzureStore.from_env - store = AzureStore.from_env(container, **store_kwargs) - return store - - -def s3_setup_args( - s3_cfg: configuration.S3Config, bucket: str = "", anonymous: bool = False, **kwargs -) -> Dict[str, Any]: - """ - Setup s3 storage, bucket is needed to create obstore store object - """ - store_kwargs: Dict[str, Any] = {} - - if _FSSPEC_S3_KEY_ID in kwargs or s3_cfg.access_key_id is not None: - store_kwargs[_FSSPEC_S3_KEY_ID] = kwargs.get(_FSSPEC_S3_KEY_ID, s3_cfg.access_key_id) - if _FSSPEC_S3_SECRET in kwargs or s3_cfg.secret_access_key is not None: - store_kwargs[_FSSPEC_S3_SECRET] = kwargs.get(_FSSPEC_S3_SECRET, s3_cfg.secret_access_key) - if "endpoint_url" in kwargs or s3_cfg.endpoint is not None: - store_kwargs["endpoint_url"] = kwargs.get("endpoint_url", s3_cfg.endpoint) - if "retries" in kwargs or s3_cfg.retries is not None: - store_kwargs["retries"] = kwargs.get("retries", s3_cfg.retries) - if "backoff" in kwargs or s3_cfg.backoff is not None: - store_kwargs["backoff"] = kwargs.get("backoff", s3_cfg.backoff) - if anonymous: - store_kwargs[_ANON] = "true" - - store = s3store_from_env(bucket, **store_kwargs) - - kwargs["store"] = store + kwargs["config"] = config + kwargs["client_options"] = client_options + kwargs["retry_config"] = retry_config return kwargs -def gs_setup_args(gcs_cfg: configuration.GCSConfig, bucket: str = "", anonymous: bool = False) -> Dict[str, Any]: +def gs_setup_args( + gcs_cfg: configuration.GCSConfig, anonymous: bool = False +) -> Dict[str, Any]: """ Setup gcs storage, bucket is needed to create obstore store object """ kwargs: Dict[str, Any] = {} - store = gcsstore_from_env(bucket) - - kwargs["store"] = store - return kwargs def azure_setup_args( azure_cfg: 
configuration.AzureBlobStorageConfig, - bucket: str = "", anonymous: bool = False, **kwargs, ) -> Dict[str, Any]: """ Setup azure blob storage, bucket is needed to create obstore store object """ - store_kwargs: Dict[str, Any] = {} - - if "account_name" in kwargs or azure_cfg.account_name is not None: - store_kwargs["account_name"] = kwargs.get("account_name", azure_cfg.account_name) - if "account_key" in kwargs or azure_cfg.account_key is not None: - store_kwargs["account_key"] = kwargs.get("account_key", azure_cfg.account_key) - if "client_id" in kwargs or azure_cfg.client_id is not None: - store_kwargs["client_id"] = kwargs.get("client_id", azure_cfg.client_id) - if "client_secret" in kwargs or azure_cfg.client_secret is not None: - store_kwargs["client_secret"] = kwargs.get("client_secret", azure_cfg.client_secret) - if "tenant_id" in kwargs or azure_cfg.tenant_id is not None: - store_kwargs["tenant_id"] = kwargs.get("tenant_id", azure_cfg.tenant_id) - if anonymous: - store_kwargs[_ANON] = "true" - store = azurestore_from_env(bucket, **store_kwargs) + config: Dict[str, Any] = {} - kwargs["store"] = store - - return kwargs + config["account_name"] = kwargs.get("account_name", azure_cfg.account_name) + config["account_key"] = kwargs.get("account_key", azure_cfg.account_key) + config["client_id"] = kwargs.get("client_id", azure_cfg.client_id) + config["client_secret"] = kwargs.get("client_secret", azure_cfg.client_secret) + config["tenant_id"] = kwargs.get("tenant_id", azure_cfg.tenant_id) + if anonymous: + kwargs[_ANON] = "true" -def split_path(path: str) -> Tuple[str, str]: - """ - Split bucket and file path + client_options = {"timeout": "99999s", "allow_http": "true"} - Parameters - ---------- - path : string - Input path, like `s3://mybucket/path/to/file` + kwargs["config"] = config + kwargs["client_options"] = client_options - Examples - -------- - >>> split_path("s3://mybucket/path/to/file") - ['mybucket', 'path/to/file'] - """ - support_types = ["s3", "gs", "abfs"] - protocol = get_protocol(path) - if protocol not in support_types: - # no bucket for file - return "", path - - if path.startswith(protocol + "://"): - path = path[len(protocol) + 3 :] - elif path.startswith(protocol + "::"): - path = path[len(protocol) + 2 :] - path = path.strip("/") - - if "/" not in path: - return path, "" - else: - path_li = path.split("/") - bucket = path_li[0] - # use obstore for s3 and gcs only now, no need to split - # bucket out of path for other storage - file_path = "/".join(path_li[1:]) - return (bucket, file_path) + return kwargs def get_fsspec_storage_options( protocol: str, data_config: typing.Optional[DataConfig] = None, anonymous: bool = False, - bucket: str = "", **kwargs, ) -> Dict[str, Any]: data_config = data_config or DataConfig.auto() @@ -226,20 +160,22 @@ def get_fsspec_storage_options( return {"auto_mkdir": True, **kwargs} if protocol == "s3": return { - **s3_setup_args(data_config.s3, bucket, anonymous=anonymous, **kwargs), + **s3_setup_args(data_config.s3, anonymous=anonymous, **kwargs), **kwargs, } if protocol == "gs": - return {**gs_setup_args(data_config.gcs, bucket, anonymous=anonymous), **kwargs} + return {**gs_setup_args(data_config.gcs, anonymous=anonymous), **kwargs} if protocol in ("abfs", "abfss"): return { - **azure_setup_args(data_config.azure, bucket, anonymous=anonymous, **kwargs), + **azure_setup_args(data_config.azure, anonymous=anonymous, **kwargs), **kwargs, } return {} -def get_additional_fsspec_call_kwargs(protocol: typing.Union[str, tuple], method_name: 
str) -> Dict[str, Any]: +def get_additional_fsspec_call_kwargs( + protocol: typing.Union[str, tuple], method_name: str +) -> Dict[str, Any]: """ These are different from the setup args functions defined above. Those kwargs are applied when asking fsspec to create the filesystem. These kwargs returned here are for when the filesystem's methods are invoked. @@ -299,7 +235,9 @@ def __init__( """ # Local access if local_sandbox_dir is None or local_sandbox_dir == "": - raise ValueError("FileAccessProvider needs to be created with a valid local_sandbox_dir") + raise ValueError( + "FileAccessProvider needs to be created with a valid local_sandbox_dir" + ) local_sandbox_dir_appended = os.path.join(local_sandbox_dir, "local_flytekit") self._local_sandbox_dir = pathlib.Path(local_sandbox_dir_appended) self._local_sandbox_dir.mkdir(parents=True, exist_ok=True) @@ -312,7 +250,9 @@ def __init__( else: self._execution_metadata = None self._default_protocol = get_protocol(str(raw_output_prefix)) - self._default_remote = cast(fsspec.AbstractFileSystem, self.get_filesystem(self._default_protocol)) + self._default_remote = cast( + fsspec.AbstractFileSystem, self.get_filesystem(self._default_protocol) + ) if os.name == "nt" and raw_output_prefix.startswith("file://"): raise FlyteAssertion("Cannot use the file:// prefix on Windows.") self._raw_output_prefix = ( @@ -341,36 +281,39 @@ def get_filesystem( protocol: typing.Optional[str] = None, anonymous: bool = False, path: typing.Optional[str] = None, - bucket: str = "", **kwargs, ) -> fsspec.AbstractFileSystem: - # TODO: add bucket to adlfs if not protocol: return self._default_remote if protocol == "file": kwargs["auto_mkdir"] = True return FlyteLocalFileSystem(**kwargs) elif protocol == "s3": - s3kwargs = s3_setup_args(self._data_config.s3, bucket, anonymous=anonymous, **kwargs) + s3kwargs = s3_setup_args( + self._data_config.s3, anonymous=anonymous, **kwargs + ) s3kwargs.update(kwargs) return fsspec.filesystem(protocol, **s3kwargs) # type: ignore elif protocol == "gs": - gskwargs = gs_setup_args(self._data_config.gcs, bucket, anonymous=anonymous) + gskwargs = gs_setup_args(self._data_config.gcs, anonymous=anonymous) gskwargs.update(kwargs) return fsspec.filesystem(protocol, **gskwargs) # type: ignore elif protocol == "abfs": - azkwargs = azure_setup_args(self._data_config.azure, bucket, anonymous=anonymous, **kwargs) + azkwargs = azure_setup_args( + self._data_config.azure, anonymous=anonymous, **kwargs + ) azkwargs.update(kwargs) return fsspec.filesystem(protocol, **azkwargs) # type: ignore elif protocol == "ftp": - kwargs.update(fsspec.implementations.ftp.FTPFileSystem._get_kwargs_from_urls(path)) + kwargs.update( + fsspec.implementations.ftp.FTPFileSystem._get_kwargs_from_urls(path) + ) return fsspec.filesystem(protocol, **kwargs) storage_options = get_fsspec_storage_options( protocol=protocol, anonymous=anonymous, data_config=self._data_config, - bucket=bucket, **kwargs, ) kwargs.update(storage_options) @@ -378,7 +321,7 @@ def get_filesystem( return fsspec.filesystem(protocol, **kwargs) async def get_async_filesystem_for_path( - self, path: str = "", bucket: str = "", anonymous: bool = False, **kwargs + self, path: str = "", anonymous: bool = False, **kwargs ) -> Union[AsyncFileSystem, fsspec.AbstractFileSystem]: protocol = get_protocol(path) loop = asyncio.get_running_loop() @@ -387,17 +330,16 @@ async def get_async_filesystem_for_path( protocol, anonymous=anonymous, path=path, - bucket=bucket, asynchronous=True, loop=loop, **kwargs, ) def 
get_filesystem_for_path( - self, path: str = "", bucket: str = "", anonymous: bool = False, **kwargs + self, path: str = "", anonymous: bool = False, **kwargs ) -> fsspec.AbstractFileSystem: protocol = get_protocol(path) - return self.get_filesystem(protocol, anonymous=anonymous, path=path, bucket=bucket, **kwargs) + return self.get_filesystem(protocol, anonymous=anonymous, path=path, *kwargs) @staticmethod def is_remote(path: Union[str, os.PathLike]) -> bool: @@ -448,7 +390,9 @@ def recursive_paths(f: str, t: str) -> typing.Tuple[str, str]: def sep(self, file_system: typing.Optional[fsspec.AbstractFileSystem]) -> str: if file_system is None or file_system.protocol == "file": return os.sep - if isinstance(file_system.protocol, tuple) or isinstance(file_system.protocol, list): + if isinstance(file_system.protocol, tuple) or isinstance( + file_system.protocol, list + ): if "file" in file_system.protocol: return os.sep return file_system.sep @@ -466,9 +410,10 @@ def exists(self, path: str) -> bool: raise oe @retry_request - async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwargs): - bucket, from_path_file_only = split_path(from_path) - file_system = await self.get_async_filesystem_for_path(from_path, bucket) + async def get( + self, from_path: str, to_path: str, recursive: bool = False, **kwargs + ): + file_system = await self.get_async_filesystem_for_path(from_path) if recursive: from_path, to_path = self.recursive_paths(from_path, to_path) try: @@ -482,17 +427,23 @@ async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwa ) logger.info(f"Getting {from_path} to {to_path}") if isinstance(file_system, AsyncFileSystem): - dst = await file_system._get(from_path_file_only, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 + dst = await file_system._get( + from_path, to_path, recursive=recursive, **kwargs + ) # pylint: disable=W0212 else: dst = file_system.get(from_path, to_path, recursive=recursive, **kwargs) if isinstance(dst, (str, pathlib.Path)): return dst return to_path except (OSError, GenericError) as oe: - logger.debug(f"Error in getting {from_path} to {to_path} rec {recursive} {oe}") + logger.debug( + f"Error in getting {from_path} to {to_path} rec {recursive} {oe}" + ) if isinstance(file_system, AsyncFileSystem): try: - exists = await file_system._exists(from_path) # pylint: disable=W0212 + exists = await file_system._exists( + from_path + ) # pylint: disable=W0212 except GenericError: # for obstore, as it does not raise FileNotFoundError in fsspec but GenericError # force it to try get_filesystem(anonymous=True) @@ -501,23 +452,30 @@ async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwa exists = file_system.exists(from_path) if not exists: raise FlyteDataNotFoundException(from_path) - file_system = self.get_filesystem(get_protocol(from_path), anonymous=True, asynchronous=True) + file_system = self.get_filesystem( + get_protocol(from_path), anonymous=True, asynchronous=True + ) if file_system is not None: logger.debug(f"Attempting anonymous get with {file_system}") if isinstance(file_system, AsyncFileSystem): - return await file_system._get(from_path, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 + return await file_system._get( + from_path, to_path, recursive=recursive, **kwargs + ) # pylint: disable=W0212 else: - return file_system.get(from_path, to_path, recursive=recursive, **kwargs) + return file_system.get( + from_path, to_path, recursive=recursive, **kwargs + ) raise oe 
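# The GenericError fallback above exists because obstore reports missing
# objects as GenericError rather than FileNotFoundError, so get() retries
# once with an anonymous filesystem before failing. A minimal sketch of the
# intended call path follows; the bucket, key, prefix and sandbox directory
# are placeholders for illustration, not values taken from this patch.
import asyncio
import tempfile

from flytekit.core.data_persistence import FileAccessProvider

async def fetch_public_object():
    fap = FileAccessProvider(
        local_sandbox_dir=tempfile.mkdtemp(prefix="flyte-"),
        raw_output_prefix="s3://example-bucket/raw",
    )
    # The first attempt uses the configured credentials; on GenericError or
    # OSError, get() retries via get_filesystem(..., anonymous=True).
    return await fap.get("s3://public-demo-bucket/data.csv", "/tmp/data.csv")

asyncio.run(fetch_public_object())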
@retry_request - async def _put(self, from_path: str, to_path: str, recursive: bool = False, **kwargs): + async def _put( + self, from_path: str, to_path: str, recursive: bool = False, **kwargs + ): """ More of an internal function to be called by put_data and put_raw_data This does not need a separate sync function. """ - bucket, to_path_file_only = split_path(to_path) - file_system = await self.get_async_filesystem_for_path(to_path, bucket) + file_system = await self.get_async_filesystem_for_path(to_path) from_path = self.strip_file_header(from_path) if recursive: # Only check this for the local filesystem @@ -537,11 +495,15 @@ async def _put(self, from_path: str, to_path: str, recursive: bool = False, **kw kwargs["metadata"] = {} kwargs["metadata"].update(self._execution_metadata) - additional_kwargs = get_additional_fsspec_call_kwargs(file_system.protocol, file_system.put.__name__) + additional_kwargs = get_additional_fsspec_call_kwargs( + file_system.protocol, file_system.put.__name__ + ) kwargs.update(additional_kwargs) if isinstance(file_system, AsyncFileSystem): - dst = await file_system._put(from_path, to_path_file_only, recursive=recursive, **kwargs) # pylint: disable=W0212 + dst = await file_system._put( + from_path, to_path, recursive=recursive, **kwargs + ) # pylint: disable=W0212 else: dst = file_system.put(from_path, to_path, recursive=recursive, **kwargs) if isinstance(dst, (str, pathlib.Path)): @@ -584,18 +546,32 @@ async def async_put_raw_data( :return: Returns the final path data was written to. """ # First figure out what the destination path should be, then call put. - upload_prefix = self.get_random_string() if upload_prefix is None else upload_prefix - to_path = self.join(self.raw_output_prefix, upload_prefix) if not skip_raw_data_prefix else upload_prefix + upload_prefix = ( + self.get_random_string() if upload_prefix is None else upload_prefix + ) + to_path = ( + self.join(self.raw_output_prefix, upload_prefix) + if not skip_raw_data_prefix + else upload_prefix + ) if file_name: to_path = self.join(to_path, file_name) else: - if isinstance(lpath, str) or isinstance(lpath, os.PathLike) or isinstance(lpath, pathlib.Path): + if ( + isinstance(lpath, str) + or isinstance(lpath, os.PathLike) + or isinstance(lpath, pathlib.Path) + ): to_path = self.join(to_path, self.get_file_tail(str(lpath))) else: to_path = self.join(to_path, self.get_random_string()) # If lpath is a file, then use put. - if isinstance(lpath, str) or isinstance(lpath, os.PathLike) or isinstance(lpath, pathlib.Path): + if ( + isinstance(lpath, str) + or isinstance(lpath, os.PathLike) + or isinstance(lpath, pathlib.Path) + ): p = pathlib.Path(lpath) from_path = str(lpath) if not p.exists(): @@ -610,13 +586,11 @@ async def async_put_raw_data( r = await self._put(from_path, to_path, **kwargs) return r or to_path - bucket, _ = split_path(to_path) - # See https://github.com/fsspec/s3fs/issues/871 for more background and pending work on the fsspec side to # support effectively async open(). For now these use-cases below will revert to sync calls. 
# raw bytes if isinstance(lpath, bytes): - fs = self.get_filesystem_for_path(to_path, bucket) + fs = self.get_filesystem_for_path(to_path) with fs.open(to_path, "wb", **kwargs) as s: s.write(lpath) return to_path @@ -625,7 +599,7 @@ async def async_put_raw_data( if isinstance(lpath, io.BufferedReader) or isinstance(lpath, io.BytesIO): if not lpath.readable(): raise FlyteAssertion("Buffered reader must be readable") - fs = self.get_filesystem_for_path(to_path, bucket) + fs = self.get_filesystem_for_path(to_path) lpath.seek(0) with fs.open(to_path, "wb", **kwargs) as s: while data := lpath.read(read_chunk_size_bytes): @@ -635,7 +609,7 @@ async def async_put_raw_data( if isinstance(lpath, io.StringIO): if not lpath.readable(): raise FlyteAssertion("Buffered reader must be readable") - fs = self.get_filesystem_for_path(to_path, bucket) + fs = self.get_filesystem_for_path(to_path) lpath.seek(0) with fs.open(to_path, "wb", **kwargs) as s: while data_str := lpath.read(read_chunk_size_bytes): @@ -668,7 +642,9 @@ def join( raise ValueError("Must provide at least one argument") base, tails = args[0], list(args[1:]) if get_protocol(base) not in str(fs.protocol): - logger.warning(f"joining {base} with incorrect fs {fs.protocol} vs {get_protocol(base)}") + logger.warning( + f"joining {base} with incorrect fs {fs.protocol} vs {get_protocol(base)}" + ) if base.endswith(fs.sep): # noqa base = base[:-1] l = [base] @@ -711,7 +687,9 @@ def generate_new_custom_path( p = fs.sep.join(s_pref) return p - def get_random_local_path(self, file_path_or_file_name: typing.Optional[str] = None) -> str: + def get_random_local_path( + self, file_path_or_file_name: typing.Optional[str] = None + ) -> str: """ Use file_path_or_file_name, when you want a random directory, but want to preserve the leaf file name """ @@ -728,7 +706,9 @@ def get_random_local_directory(self) -> str: pathlib.Path(_dir).mkdir(parents=True, exist_ok=True) return _dir - def get_random_remote_path(self, file_path_or_file_name: typing.Optional[str] = None) -> str: + def get_random_remote_path( + self, file_path_or_file_name: typing.Optional[str] = None + ) -> str: if file_path_or_file_name: return self.join( self.raw_output_prefix, @@ -772,7 +752,9 @@ def upload_directory(self, local_path: str, remote_path: str, **kwargs): """ return self.put_data(local_path, remote_path, is_multipart=True, **kwargs) - async def async_get_data(self, remote_path: str, local_path: str, is_multipart: bool = False, **kwargs): + async def async_get_data( + self, remote_path: str, local_path: str, is_multipart: bool = False, **kwargs + ): """ :param remote_path: :param local_path: @@ -781,7 +763,9 @@ async def async_get_data(self, remote_path: str, local_path: str, is_multipart: try: pathlib.Path(local_path).parent.mkdir(parents=True, exist_ok=True) with timeit(f"Download data to local from {remote_path}"): - await self.get(remote_path, to_path=local_path, recursive=is_multipart, **kwargs) + await self.get( + remote_path, to_path=local_path, recursive=is_multipart, **kwargs + ) except FlyteDataNotFoundException: raise except Exception as ex: @@ -810,7 +794,9 @@ async def async_put_data( try: local_path = str(local_path) with timeit(f"Upload data to {remote_path}"): - put_result = await self._put(cast(str, local_path), remote_path, recursive=is_multipart, **kwargs) + put_result = await self._put( + cast(str, local_path), remote_path, recursive=is_multipart, **kwargs + ) # This is an unfortunate workaround to ensure that we return the correct path for the remote location # 
Callers of this put_data function in flytekit have been changed to assign the remote path to the # output @@ -828,9 +814,8 @@ async def async_put_data( put_data = loop_manager.synced(async_put_data) -fsspec.register_implementation("s3", ObstoreS3FileSystem) -fsspec.register_implementation("gs", ObstoreGCSFileSystem) -fsspec.register_implementation("abfs", ObstoreAzureBlobFileSystem) +register(["s3", "gs", "abfs"]) + flyte_tmp_dir = tempfile.mkdtemp(prefix="flyte-") default_local_file_access_provider = FileAccessProvider( diff --git a/flytekit/core/obstore_filesystem.py b/flytekit/core/obstore_filesystem.py deleted file mode 100644 index b82e8a6d1c..0000000000 --- a/flytekit/core/obstore_filesystem.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Classes that overrides the AsyncFsspecStore that specify the filesystem specific parameters -""" - -from obstore.fsspec import AsyncFsspecStore - -DEFAULT_BLOCK_SIZE = 5 * 2**20 - - -class ObstoreS3FileSystem(AsyncFsspecStore): - """ - Add following property used in S3FileSystem - """ - - root_marker = "" - blocksize = DEFAULT_BLOCK_SIZE - protocol = ("s3", "s3a") - _extra_tokenize_attributes = ("default_block_size",) - - -class ObstoreGCSFileSystem(AsyncFsspecStore): - """ - Add following property used in GCSFileSystem - """ - - scopes = {"read_only", "read_write", "full_control"} - blocksize = DEFAULT_BLOCK_SIZE - protocol = "gcs", "gs" - - -class ObstoreAzureBlobFileSystem(AsyncFsspecStore): - """ - Add following property used in AzureBlobFileSystem - """ - - protocol = "abfs" diff --git a/flytekit/types/structured/basic_dfs.py b/flytekit/types/structured/basic_dfs.py index 0a44cead59..88a7896d24 100644 --- a/flytekit/types/structured/basic_dfs.py +++ b/flytekit/types/structured/basic_dfs.py @@ -9,7 +9,7 @@ from flytekit import FlyteContext, lazy_module, logger from flytekit.configuration import DataConfig -from flytekit.core.data_persistence import get_fsspec_storage_options, split_path +from flytekit.core.data_persistence import get_fsspec_storage_options from flytekit.models import literals from flytekit.models.literals import StructuredDatasetMetadata from flytekit.models.types import StructuredDatasetType @@ -37,12 +37,10 @@ def get_pandas_storage_options( from pandas.io.common import is_fsspec_url if is_fsspec_url(uri): - bucket, _ = split_path(uri) return get_fsspec_storage_options( protocol=get_protocol(uri), data_config=data_config, anonymous=anonymous, - bucket=bucket, ) # Pandas does not allow storage_options for non-fsspec paths e.g. local. 
diff --git a/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py b/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py index e749eec660..1924f57d84 100644 --- a/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py +++ b/plugins/flytekit-polars/flytekitplugins/polars/sd_transformers.py @@ -2,7 +2,7 @@ import typing from flytekit import FlyteContext, lazy_module -from flytekit.core.data_persistence import get_fsspec_storage_options, split_path +from flytekit.core.data_persistence import get_fsspec_storage_options from flytekit.models import literals from flytekit.models.literals import StructuredDatasetMetadata from flytekit.models.types import StructuredDatasetType @@ -91,11 +91,9 @@ def decode( current_task_metadata: StructuredDatasetMetadata, ) -> pl.DataFrame: uri = flyte_value.uri - bucket, _ = split_path(uri) kwargs = get_fsspec_storage_options( protocol=fsspec_utils.get_protocol(uri), data_config=ctx.file_access.data_config, - bucket=bucket, ) if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns: columns = [c.name for c in current_task_metadata.structured_dataset_type.columns] @@ -154,11 +152,9 @@ def decode( current_task_metadata: StructuredDatasetMetadata, ) -> pl.LazyFrame: uri = flyte_value.uri - bucket, _ = split_path(uri) kwargs = get_fsspec_storage_options( protocol=fsspec_utils.get_protocol(uri), data_config=ctx.file_access.data_config, - bucket=bucket, ) # use read_parquet instead of scan_parquet for now because scan_parquet currently doesn't work with fsspec: # https://github.com/pola-rs/polars/issues/16737 From e40fedf922bdb4d66eb0c693583154824123b0c6 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 6 Mar 2025 19:57:07 +0800 Subject: [PATCH 26/42] fix: update storage config Signed-off-by: machichima --- flytekit/core/data_persistence.py | 60 ++++++++++++++++++------------- pyproject.toml | 2 +- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 5e80d3e364..370f4cc31d 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -78,33 +78,37 @@ def s3_setup_args( config: Dict[str, Any] = {} - config[_FSSPEC_S3_KEY_ID] = kwargs.pop(_FSSPEC_S3_KEY_ID, s3_cfg.access_key_id) - config[_FSSPEC_S3_SECRET] = kwargs.pop(_FSSPEC_S3_SECRET, s3_cfg.secret_access_key) - config["endpoint_url"] = kwargs.pop("endpoint_url", s3_cfg.endpoint) + if _FSSPEC_S3_KEY_ID in kwargs or s3_cfg.access_key_id: + config[_FSSPEC_S3_KEY_ID] = kwargs.pop(_FSSPEC_S3_KEY_ID, s3_cfg.access_key_id) + if _FSSPEC_S3_SECRET in kwargs or s3_cfg.secret_access_key: + config[_FSSPEC_S3_SECRET] = kwargs.pop( + _FSSPEC_S3_SECRET, s3_cfg.secret_access_key + ) + if "endpoint_url" in kwargs or s3_cfg.endpoint: + config["endpoint_url"] = kwargs.pop("endpoint_url", s3_cfg.endpoint) retries = kwargs.pop("retries", s3_cfg.retries) backoff = kwargs.pop("backoff", s3_cfg.backoff) if anonymous: - kwargs[_ANON] = "true" - - retry_config = ( - { - "max_retries": retries, - "backoff": { - "base": 2, - "init_backoff": backoff, - "max_backoff": timedelta(seconds=16), - }, - "retry_timeout": timedelta(minutes=3), + kwargs[_ANON] = True + + retry_config = { + "max_retries": retries, + "backoff": { + "base": 2, + "init_backoff": backoff, + "max_backoff": timedelta(seconds=16), }, - ) + "retry_timeout": timedelta(minutes=3), + } client_options = {"timeout": "99999s", "allow_http": "true"} - kwargs["config"] = config - 
kwargs["client_options"] = client_options - kwargs["retry_config"] = retry_config + if config: + kwargs["config"] = config + kwargs["client_options"] = client_options or None + kwargs["retry_config"] = retry_config or None return kwargs @@ -131,18 +135,24 @@ def azure_setup_args( config: Dict[str, Any] = {} - config["account_name"] = kwargs.get("account_name", azure_cfg.account_name) - config["account_key"] = kwargs.get("account_key", azure_cfg.account_key) - config["client_id"] = kwargs.get("client_id", azure_cfg.client_id) - config["client_secret"] = kwargs.get("client_secret", azure_cfg.client_secret) - config["tenant_id"] = kwargs.get("tenant_id", azure_cfg.tenant_id) + if "account_name" in kwargs or azure_cfg.account_name: + config["account_name"] = kwargs.get("account_name", azure_cfg.account_name) + if "account_key" in kwargs or azure_cfg.account_key: + config["account_key"] = kwargs.get("account_key", azure_cfg.account_key) + if "client_id" in kwargs or azure_cfg.client_id: + config["client_id"] = kwargs.get("client_id", azure_cfg.client_id) + if "client_secret" in kwargs or azure_cfg.client_secret: + config["client_secret"] = kwargs.get("client_secret", azure_cfg.client_secret) + if "tenant_id" in kwargs or azure_cfg.tenant_id: + config["tenant_id"] = kwargs.get("tenant_id", azure_cfg.tenant_id) if anonymous: - kwargs[_ANON] = "true" + kwargs[_ANON] = True client_options = {"timeout": "99999s", "allow_http": "true"} - kwargs["config"] = config + if config: + kwargs["config"] = config kwargs["client_options"] = client_options return kwargs diff --git a/pyproject.toml b/pyproject.toml index 60b51109c5..d471779c5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "marshmallow-jsonschema>=0.12.0", "mashumaro>=3.15", "msgpack>=1.1.0", - "obstore==0.3.0b10", + "obstore", "protobuf!=4.25.0", "pygments", "python-json-logger>=2.0.0", From 4dcb8a34a39e1033badae0aafa5b45b5151b036b Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 6 Mar 2025 21:43:36 +0800 Subject: [PATCH 27/42] style: lint Signed-off-by: machichima --- flytekit/core/data_persistence.py | 148 ++++++++---------------------- 1 file changed, 37 insertions(+), 111 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 370f4cc31d..74322e8f20 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -25,9 +25,8 @@ import tempfile import typing from datetime import timedelta -from functools import lru_cache from time import sleep -from typing import Any, Dict, Optional, Tuple, Union, cast +from typing import Any, Dict, Optional, Union, cast from uuid import UUID import fsspec @@ -35,7 +34,6 @@ from fsspec.asyn import AsyncFileSystem from fsspec.utils import get_protocol from obstore.exceptions import GenericError -from obstore.store import AzureStore, GCSStore, S3Store from obstore.fsspec import register from typing_extensions import Unpack @@ -58,20 +56,14 @@ _FSSPEC_S3_SECRET = "secret_access_key" _ANON = "skip_signature" -Uploadable = typing.Union[ - str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO -] +Uploadable = typing.Union[str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO] # This is the default chunk size flytekit will use for writing to S3 and GCS. This is set to 25MB by default and is # configurable by the user if needed. This is used when put() is called on filesystems. 
-_WRITE_SIZE_CHUNK_BYTES = int( - os.environ.get("_F_P_WRITE_CHUNK_SIZE", "26214400") -) # 25 * 2**20 +_WRITE_SIZE_CHUNK_BYTES = int(os.environ.get("_F_P_WRITE_CHUNK_SIZE", "26214400")) # 25 * 2**20 -def s3_setup_args( - s3_cfg: configuration.S3Config, anonymous: bool = False, **kwargs -) -> Dict[str, Any]: +def s3_setup_args(s3_cfg: configuration.S3Config, anonymous: bool = False, **kwargs) -> Dict[str, Any]: """ Setup s3 storage, bucket is needed to create obstore store object """ @@ -81,9 +73,7 @@ def s3_setup_args( if _FSSPEC_S3_KEY_ID in kwargs or s3_cfg.access_key_id: config[_FSSPEC_S3_KEY_ID] = kwargs.pop(_FSSPEC_S3_KEY_ID, s3_cfg.access_key_id) if _FSSPEC_S3_SECRET in kwargs or s3_cfg.secret_access_key: - config[_FSSPEC_S3_SECRET] = kwargs.pop( - _FSSPEC_S3_SECRET, s3_cfg.secret_access_key - ) + config[_FSSPEC_S3_SECRET] = kwargs.pop(_FSSPEC_S3_SECRET, s3_cfg.secret_access_key) if "endpoint_url" in kwargs or s3_cfg.endpoint: config["endpoint_url"] = kwargs.pop("endpoint_url", s3_cfg.endpoint) @@ -91,7 +81,7 @@ def s3_setup_args( backoff = kwargs.pop("backoff", s3_cfg.backoff) if anonymous: - kwargs[_ANON] = True + config[_ANON] = True retry_config = { "max_retries": retries, @@ -113,9 +103,7 @@ def s3_setup_args( return kwargs -def gs_setup_args( - gcs_cfg: configuration.GCSConfig, anonymous: bool = False -) -> Dict[str, Any]: +def gs_setup_args(gcs_cfg: configuration.GCSConfig, anonymous: bool = False) -> Dict[str, Any]: """ Setup gcs storage, bucket is needed to create obstore store object """ @@ -147,7 +135,7 @@ def azure_setup_args( config["tenant_id"] = kwargs.get("tenant_id", azure_cfg.tenant_id) if anonymous: - kwargs[_ANON] = True + config[_ANON] = True client_options = {"timeout": "99999s", "allow_http": "true"} @@ -183,9 +171,7 @@ def get_fsspec_storage_options( return {} -def get_additional_fsspec_call_kwargs( - protocol: typing.Union[str, tuple], method_name: str -) -> Dict[str, Any]: +def get_additional_fsspec_call_kwargs(protocol: typing.Union[str, tuple], method_name: str) -> Dict[str, Any]: """ These are different from the setup args functions defined above. Those kwargs are applied when asking fsspec to create the filesystem. These kwargs returned here are for when the filesystem's methods are invoked. 
@@ -245,9 +231,7 @@ def __init__( """ # Local access if local_sandbox_dir is None or local_sandbox_dir == "": - raise ValueError( - "FileAccessProvider needs to be created with a valid local_sandbox_dir" - ) + raise ValueError("FileAccessProvider needs to be created with a valid local_sandbox_dir") local_sandbox_dir_appended = os.path.join(local_sandbox_dir, "local_flytekit") self._local_sandbox_dir = pathlib.Path(local_sandbox_dir_appended) self._local_sandbox_dir.mkdir(parents=True, exist_ok=True) @@ -260,9 +244,7 @@ def __init__( else: self._execution_metadata = None self._default_protocol = get_protocol(str(raw_output_prefix)) - self._default_remote = cast( - fsspec.AbstractFileSystem, self.get_filesystem(self._default_protocol) - ) + self._default_remote = cast(fsspec.AbstractFileSystem, self.get_filesystem(self._default_protocol)) if os.name == "nt" and raw_output_prefix.startswith("file://"): raise FlyteAssertion("Cannot use the file:// prefix on Windows.") self._raw_output_prefix = ( @@ -299,9 +281,7 @@ def get_filesystem( kwargs["auto_mkdir"] = True return FlyteLocalFileSystem(**kwargs) elif protocol == "s3": - s3kwargs = s3_setup_args( - self._data_config.s3, anonymous=anonymous, **kwargs - ) + s3kwargs = s3_setup_args(self._data_config.s3, anonymous=anonymous, **kwargs) s3kwargs.update(kwargs) return fsspec.filesystem(protocol, **s3kwargs) # type: ignore elif protocol == "gs": @@ -309,15 +289,11 @@ def get_filesystem( gskwargs.update(kwargs) return fsspec.filesystem(protocol, **gskwargs) # type: ignore elif protocol == "abfs": - azkwargs = azure_setup_args( - self._data_config.azure, anonymous=anonymous, **kwargs - ) + azkwargs = azure_setup_args(self._data_config.azure, anonymous=anonymous, **kwargs) azkwargs.update(kwargs) return fsspec.filesystem(protocol, **azkwargs) # type: ignore elif protocol == "ftp": - kwargs.update( - fsspec.implementations.ftp.FTPFileSystem._get_kwargs_from_urls(path) - ) + kwargs.update(fsspec.implementations.ftp.FTPFileSystem._get_kwargs_from_urls(path)) return fsspec.filesystem(protocol, **kwargs) storage_options = get_fsspec_storage_options( @@ -345,11 +321,9 @@ async def get_async_filesystem_for_path( **kwargs, ) - def get_filesystem_for_path( - self, path: str = "", anonymous: bool = False, **kwargs - ) -> fsspec.AbstractFileSystem: + def get_filesystem_for_path(self, path: str = "", anonymous: bool = False, **kwargs) -> fsspec.AbstractFileSystem: protocol = get_protocol(path) - return self.get_filesystem(protocol, anonymous=anonymous, path=path, *kwargs) + return self.get_filesystem(protocol, anonymous=anonymous, path=path, **kwargs) @staticmethod def is_remote(path: Union[str, os.PathLike]) -> bool: @@ -400,9 +374,7 @@ def recursive_paths(f: str, t: str) -> typing.Tuple[str, str]: def sep(self, file_system: typing.Optional[fsspec.AbstractFileSystem]) -> str: if file_system is None or file_system.protocol == "file": return os.sep - if isinstance(file_system.protocol, tuple) or isinstance( - file_system.protocol, list - ): + if isinstance(file_system.protocol, tuple) or isinstance(file_system.protocol, list): if "file" in file_system.protocol: return os.sep return file_system.sep @@ -420,9 +392,7 @@ def exists(self, path: str) -> bool: raise oe @retry_request - async def get( - self, from_path: str, to_path: str, recursive: bool = False, **kwargs - ): + async def get(self, from_path: str, to_path: str, recursive: bool = False, **kwargs): file_system = await self.get_async_filesystem_for_path(from_path) if recursive: from_path, to_path = 
self.recursive_paths(from_path, to_path) @@ -437,23 +407,17 @@ async def get( ) logger.info(f"Getting {from_path} to {to_path}") if isinstance(file_system, AsyncFileSystem): - dst = await file_system._get( - from_path, to_path, recursive=recursive, **kwargs - ) # pylint: disable=W0212 + dst = await file_system._get(from_path, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 else: dst = file_system.get(from_path, to_path, recursive=recursive, **kwargs) if isinstance(dst, (str, pathlib.Path)): return dst return to_path except (OSError, GenericError) as oe: - logger.debug( - f"Error in getting {from_path} to {to_path} rec {recursive} {oe}" - ) + logger.debug(f"Error in getting {from_path} to {to_path} rec {recursive} {oe}") if isinstance(file_system, AsyncFileSystem): try: - exists = await file_system._exists( - from_path - ) # pylint: disable=W0212 + exists = await file_system._exists(from_path) # pylint: disable=W0212 except GenericError: # for obstore, as it does not raise FileNotFoundError in fsspec but GenericError # force it to try get_filesystem(anonymous=True) @@ -462,25 +426,17 @@ async def get( exists = file_system.exists(from_path) if not exists: raise FlyteDataNotFoundException(from_path) - file_system = self.get_filesystem( - get_protocol(from_path), anonymous=True, asynchronous=True - ) + file_system = self.get_filesystem(get_protocol(from_path), anonymous=True, asynchronous=True) if file_system is not None: logger.debug(f"Attempting anonymous get with {file_system}") if isinstance(file_system, AsyncFileSystem): - return await file_system._get( - from_path, to_path, recursive=recursive, **kwargs - ) # pylint: disable=W0212 + return await file_system._get(from_path, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 else: - return file_system.get( - from_path, to_path, recursive=recursive, **kwargs - ) + return file_system.get(from_path, to_path, recursive=recursive, **kwargs) raise oe @retry_request - async def _put( - self, from_path: str, to_path: str, recursive: bool = False, **kwargs - ): + async def _put(self, from_path: str, to_path: str, recursive: bool = False, **kwargs): """ More of an internal function to be called by put_data and put_raw_data This does not need a separate sync function. @@ -505,15 +461,11 @@ async def _put( kwargs["metadata"] = {} kwargs["metadata"].update(self._execution_metadata) - additional_kwargs = get_additional_fsspec_call_kwargs( - file_system.protocol, file_system.put.__name__ - ) + additional_kwargs = get_additional_fsspec_call_kwargs(file_system.protocol, file_system.put.__name__) kwargs.update(additional_kwargs) if isinstance(file_system, AsyncFileSystem): - dst = await file_system._put( - from_path, to_path, recursive=recursive, **kwargs - ) # pylint: disable=W0212 + dst = await file_system._put(from_path, to_path, recursive=recursive, **kwargs) # pylint: disable=W0212 else: dst = file_system.put(from_path, to_path, recursive=recursive, **kwargs) if isinstance(dst, (str, pathlib.Path)): @@ -556,32 +508,18 @@ async def async_put_raw_data( :return: Returns the final path data was written to. """ # First figure out what the destination path should be, then call put. 
- upload_prefix = ( - self.get_random_string() if upload_prefix is None else upload_prefix - ) - to_path = ( - self.join(self.raw_output_prefix, upload_prefix) - if not skip_raw_data_prefix - else upload_prefix - ) + upload_prefix = self.get_random_string() if upload_prefix is None else upload_prefix + to_path = self.join(self.raw_output_prefix, upload_prefix) if not skip_raw_data_prefix else upload_prefix if file_name: to_path = self.join(to_path, file_name) else: - if ( - isinstance(lpath, str) - or isinstance(lpath, os.PathLike) - or isinstance(lpath, pathlib.Path) - ): + if isinstance(lpath, str) or isinstance(lpath, os.PathLike) or isinstance(lpath, pathlib.Path): to_path = self.join(to_path, self.get_file_tail(str(lpath))) else: to_path = self.join(to_path, self.get_random_string()) # If lpath is a file, then use put. - if ( - isinstance(lpath, str) - or isinstance(lpath, os.PathLike) - or isinstance(lpath, pathlib.Path) - ): + if isinstance(lpath, str) or isinstance(lpath, os.PathLike) or isinstance(lpath, pathlib.Path): p = pathlib.Path(lpath) from_path = str(lpath) if not p.exists(): @@ -652,9 +590,7 @@ def join( raise ValueError("Must provide at least one argument") base, tails = args[0], list(args[1:]) if get_protocol(base) not in str(fs.protocol): - logger.warning( - f"joining {base} with incorrect fs {fs.protocol} vs {get_protocol(base)}" - ) + logger.warning(f"joining {base} with incorrect fs {fs.protocol} vs {get_protocol(base)}") if base.endswith(fs.sep): # noqa base = base[:-1] l = [base] @@ -697,9 +633,7 @@ def generate_new_custom_path( p = fs.sep.join(s_pref) return p - def get_random_local_path( - self, file_path_or_file_name: typing.Optional[str] = None - ) -> str: + def get_random_local_path(self, file_path_or_file_name: typing.Optional[str] = None) -> str: """ Use file_path_or_file_name, when you want a random directory, but want to preserve the leaf file name """ @@ -716,9 +650,7 @@ def get_random_local_directory(self) -> str: pathlib.Path(_dir).mkdir(parents=True, exist_ok=True) return _dir - def get_random_remote_path( - self, file_path_or_file_name: typing.Optional[str] = None - ) -> str: + def get_random_remote_path(self, file_path_or_file_name: typing.Optional[str] = None) -> str: if file_path_or_file_name: return self.join( self.raw_output_prefix, @@ -762,9 +694,7 @@ def upload_directory(self, local_path: str, remote_path: str, **kwargs): """ return self.put_data(local_path, remote_path, is_multipart=True, **kwargs) - async def async_get_data( - self, remote_path: str, local_path: str, is_multipart: bool = False, **kwargs - ): + async def async_get_data(self, remote_path: str, local_path: str, is_multipart: bool = False, **kwargs): """ :param remote_path: :param local_path: @@ -773,9 +703,7 @@ async def async_get_data( try: pathlib.Path(local_path).parent.mkdir(parents=True, exist_ok=True) with timeit(f"Download data to local from {remote_path}"): - await self.get( - remote_path, to_path=local_path, recursive=is_multipart, **kwargs - ) + await self.get(remote_path, to_path=local_path, recursive=is_multipart, **kwargs) except FlyteDataNotFoundException: raise except Exception as ex: @@ -804,9 +732,7 @@ async def async_put_data( try: local_path = str(local_path) with timeit(f"Upload data to {remote_path}"): - put_result = await self._put( - cast(str, local_path), remote_path, recursive=is_multipart, **kwargs - ) + put_result = await self._put(cast(str, local_path), remote_path, recursive=is_multipart, **kwargs) # This is an unfortunate workaround to ensure that 
we return the correct path for the remote location # Callers of this put_data function in flytekit have been changed to assign the remote path to the # output From eb27d768fb18e5d42fc783a45286bddf8975d2cf Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 6 Mar 2025 21:49:00 +0800 Subject: [PATCH 28/42] test: update test Signed-off-by: machichima --- .../unit/bin/test_python_entrypoint.py | 2 +- tests/flytekit/unit/core/test_data.py | 118 ++++++------------ .../unit/core/test_data_persistence.py | 36 ++---- .../unit/core/test_flyte_directory.py | 2 +- tests/flytekit/unit/remote/test_fs_remote.py | 3 + 5 files changed, 56 insertions(+), 105 deletions(-) diff --git a/tests/flytekit/unit/bin/test_python_entrypoint.py b/tests/flytekit/unit/bin/test_python_entrypoint.py index 8388bb77c6..ae97212e82 100644 --- a/tests/flytekit/unit/bin/test_python_entrypoint.py +++ b/tests/flytekit/unit/bin/test_python_entrypoint.py @@ -428,7 +428,7 @@ def test_setup_for_fast_register(): @mock.patch("google.auth.compute_engine._metadata") def test_setup_cloud_prefix(mock_gcs): with setup_execution("s3://", checkpoint_path=None, prev_checkpoint=None) as ctx: - assert ctx.file_access._default_remote.protocol[0] == "s3" + assert ctx.file_access._default_remote.protocol == "s3" with setup_execution("gs://", checkpoint_path=None, prev_checkpoint=None) as ctx: assert "gs" in ctx.file_access._default_remote.protocol diff --git a/tests/flytekit/unit/core/test_data.py b/tests/flytekit/unit/core/test_data.py index 6cb55a6675..f69041236f 100644 --- a/tests/flytekit/unit/core/test_data.py +++ b/tests/flytekit/unit/core/test_data.py @@ -9,9 +9,8 @@ from botocore.parsers import base64 import fsspec import mock -from obstore.store import S3Store import pytest -from s3fs import S3FileSystem +from obstore.fsspec import FsspecStore from flytekit.configuration import Config, DataConfig, S3Config from flytekit.core.context_manager import FlyteContextManager, FlyteContext @@ -19,6 +18,8 @@ FileAccessProvider, get_fsspec_storage_options, s3_setup_args, + _FSSPEC_S3_KEY_ID, + _FSSPEC_S3_SECRET, ) from flytekit.core.type_engine import TypeEngine from flytekit.types.directory.types import FlyteDirectory @@ -170,16 +171,22 @@ async def test_async_file_system(): remote_path = "test:///tmp/test.py" local_path = "test.py" - class MockAsyncFileSystem(S3FileSystem): + class MockAsyncFileSystem(FsspecStore): + protocol = "test" + asynchronous = True def __init__(self, *args, **kwargs): - super().__init__(args, kwargs) + super().__init__(*args, **kwargs) async def _put_file(self, *args, **kwargs): - # s3fs._put_file returns None as well + # FsspecStore._put_file returns None as well return None + async def _isdir(self, *args, **kwargs): + # Return False indicating not directory here + return False + async def _get_file(self, *args, **kwargs): - # s3fs._get_file returns None as well + # FsspecStore._get_file returns None as well return None async def _lsdir( @@ -239,25 +246,17 @@ def test_local_provider_get_empty(): @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.s3store_from_env") -def test_s3_setup_args_env_empty(mock_from_env, mock_os, mock_get_config_file): +def test_s3_setup_args_env_empty(mock_os, mock_get_config_file): mock_get_config_file.return_value = None mock_os.get.return_value = None s3c = S3Config.auto() kwargs = s3_setup_args(s3c) - - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - retries=3, - 
backoff=timedelta(seconds=5), - ) + assert all(key in kwargs for key in ("client_options", "retry_config")) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.s3store_from_env") -def test_s3_setup_args_env_both(mock_from_env, mock_os, mock_get_config_file): +def test_s3_setup_args_env_both(mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { @@ -269,20 +268,15 @@ def test_s3_setup_args_env_both(mock_from_env, mock_os, mock_get_config_file): mock_os.get.side_effect = lambda x, y: ee.get(x) kwargs = s3_setup_args(S3Config.auto()) - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - retries=3, - backoff=timedelta(seconds=5), - access_key_id = "flyte", - secret_access_key = "flyte-secret", - ) + assert kwargs["config"] == { + _FSSPEC_S3_KEY_ID: "flyte", + _FSSPEC_S3_SECRET: "flyte-secret", + } @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.s3store_from_env") -def test_s3_setup_args_env_flyte(mock_from_env, mock_os, mock_get_config_file): +def test_s3_setup_args_env_flyte(mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "FLYTE_AWS_ACCESS_KEY_ID": "flyte", @@ -291,20 +285,15 @@ def test_s3_setup_args_env_flyte(mock_from_env, mock_os, mock_get_config_file): mock_os.get.side_effect = lambda x, y: ee.get(x) kwargs = s3_setup_args(S3Config.auto()) - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - retries=3, - backoff=timedelta(seconds=5), - access_key_id = "flyte", - secret_access_key = "flyte-secret", - ) + assert kwargs["config"] == { + _FSSPEC_S3_KEY_ID: "flyte", + _FSSPEC_S3_SECRET: "flyte-secret", + } @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.s3store_from_env") -def test_s3_setup_args_env_aws(mock_from_env, mock_os, mock_get_config_file): +def test_s3_setup_args_env_aws(mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "AWS_ACCESS_KEY_ID": "ignore-user", @@ -312,35 +301,25 @@ def test_s3_setup_args_env_aws(mock_from_env, mock_os, mock_get_config_file): } mock_os.get.side_effect = lambda x, y: ee.get(x) kwargs = s3_setup_args(S3Config.auto()) - - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - retries=3, - backoff=timedelta(seconds=5), - ) + # not explicitly in kwargs, since fsspec/boto3 will use these env vars by default + assert "config" not in kwargs @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.gcsstore_from_env") -def test_get_fsspec_storage_options_gcs(mock_from_env, mock_os, mock_get_config_file): +def test_get_fsspec_storage_options_gcs(mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "FLYTE_GCP_GSUTIL_PARALLELISM": "False", } mock_os.get.side_effect = lambda x, y: ee.get(x) storage_options = get_fsspec_storage_options("gs", DataConfig.auto()) - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", # bucket name is empty - ) + assert storage_options == {} @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.gcsstore_from_env") -def test_get_fsspec_storage_options_gcs_with_overrides(mock_from_env, mock_os, mock_get_config_file): +def 
test_get_fsspec_storage_options_gcs_with_overrides(mock_os, mock_get_config_file): mock_get_config_file.return_value = None ee = { "FLYTE_GCP_GSUTIL_PARALLELISM": "False", @@ -350,16 +329,11 @@ def test_get_fsspec_storage_options_gcs_with_overrides(mock_from_env, mock_os, m "gs", DataConfig.auto(), anonymous=True, other_argument="value" ) assert "other_argument" in storage_options - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", # bucket name is empty - ) @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.azurestore_from_env") -def test_get_fsspec_storage_options_azure(mock_from_env, mock_os, mock_get_config_file): +def test_get_fsspec_storage_options_azure(mock_os, mock_get_config_file): mock_get_config_file.return_value = None account_key = "accountkey" @@ -375,21 +349,16 @@ def test_get_fsspec_storage_options_azure(mock_from_env, mock_os, mock_get_confi mock_os.get.side_effect = lambda x, y: ee.get(x) storage_options = get_fsspec_storage_options("abfs", DataConfig.auto()) - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - account_name = "accountname", - account_key = account_key_base64, - client_id = "clientid", - client_secret = "clientsecret", - tenant_id = "tenantid", - ) + assert storage_options["config"]["account_name"] == "accountname" + assert storage_options["config"]["account_key"] == account_key_base64 + assert storage_options["config"]["client_id"] == "clientid" + assert storage_options["config"]["client_secret"] == "clientsecret" + assert storage_options["config"]["tenant_id"] == "tenantid" @mock.patch("flytekit.configuration.get_config_file") @mock.patch("os.environ") -@mock.patch("flytekit.core.data_persistence.azurestore_from_env") -def test_get_fsspec_storage_options_azure_with_overrides(mock_from_env, mock_os, mock_get_config_file): +def test_get_fsspec_storage_options_azure_with_overrides(mock_os, mock_get_config_file): mock_get_config_file.return_value = None account_key = "accountkey" @@ -408,14 +377,9 @@ def test_get_fsspec_storage_options_azure_with_overrides(mock_from_env, mock_os, other_argument="value", ) - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - account_name = "other_accountname", - account_key = account_key_base64, - skip_signature = "true", - ) - + assert storage_options["config"]["account_name"] == "other_accountname" + assert storage_options["config"]["account_key"] == account_key_base64 + assert storage_options["config"]["skip_signature"] == True def test_crawl_local_nt(source_folder): diff --git a/tests/flytekit/unit/core/test_data_persistence.py b/tests/flytekit/unit/core/test_data_persistence.py index e3c720734f..f83bfd6f26 100644 --- a/tests/flytekit/unit/core/test_data_persistence.py +++ b/tests/flytekit/unit/core/test_data_persistence.py @@ -9,7 +9,6 @@ import fsspec import mock import pytest -from azure.identity import ClientSecretCredential, DefaultAzureCredential from botocore.parsers import base64 from mock import AsyncMock @@ -155,8 +154,7 @@ def test_generate_new_custom_path(): assert np == "s3://foo-bucket/my-default-prefix/bar.txt" -@mock.patch("flytekit.core.data_persistence.azurestore_from_env") -def test_initialise_azure_file_provider_with_account_key(mock_from_env): +def test_initialise_azure_file_provider_with_account_key(): account_key = "accountkey" account_key_base64 = base64.b64encode(account_key.encode()).decode() @@ -166,16 +164,11 @@ def 
test_initialise_azure_file_provider_with_account_key(mock_from_env): ): fp = FileAccessProvider("/tmp", "abfs://container/path/within/container") - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - account_name = "accountname", - account_key = account_key_base64, - ) + assert fp.get_filesystem().config["account_name"] == "accountname" + assert fp.get_filesystem().config["account_key"] == account_key_base64 -@mock.patch("obstore.store.AzureStore.from_env") -def test_initialise_azure_file_provider_with_service_principal(mock_from_env): +def test_initialise_azure_file_provider_with_service_principal(): with mock.patch.dict( os.environ, { @@ -187,18 +180,13 @@ def test_initialise_azure_file_provider_with_service_principal(mock_from_env): ): fp = FileAccessProvider("/tmp", "abfs://container/path/within/container") - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - account_name = "accountname", - client_secret = "clientsecret", - client_id = "clientid", - tenant_id = "tenantid", - ) + assert fp.get_filesystem().config["account_name"] == "accountname" + assert fp.get_filesystem().config["client_secret"] == "clientsecret" + assert fp.get_filesystem().config["client_id"] == "clientid" + assert fp.get_filesystem().config["tenant_id"] == "tenantid" -@mock.patch("obstore.store.AzureStore.from_env") -def test_initialise_azure_file_provider_with_default_credential(mock_from_env): +def test_initialise_azure_file_provider_with_default_credential(): with mock.patch.dict( os.environ, { @@ -208,11 +196,7 @@ def test_initialise_azure_file_provider_with_default_credential(mock_from_env): ): fp = FileAccessProvider("/tmp", "abfs://container/path/within/container") - mock_from_env.return_value = mock.Mock() - mock_from_env.assert_called_with( - "", - account_name = "accountname", - ) + assert fp.get_filesystem().config["account_name"] == "accountname" def test_get_file_system(): diff --git a/tests/flytekit/unit/core/test_flyte_directory.py b/tests/flytekit/unit/core/test_flyte_directory.py index 8c80836356..5b3b41115f 100644 --- a/tests/flytekit/unit/core/test_flyte_directory.py +++ b/tests/flytekit/unit/core/test_flyte_directory.py @@ -319,7 +319,7 @@ def test_directory_guess(): assert fft.extension() == "" -@mock.patch("flytekit.core.obstore_filesystem.ObstoreS3FileSystem._ls") +@mock.patch("obstore.fsspec.FsspecStore._ls") @mock.patch("flytekit.core.data_persistence.FileAccessProvider.get_data") def test_list_dir(mock_get_data, mock_ls): remote_dir = "s3://test-flytedir" diff --git a/tests/flytekit/unit/remote/test_fs_remote.py b/tests/flytekit/unit/remote/test_fs_remote.py index 5c635376b4..274ad7bc45 100644 --- a/tests/flytekit/unit/remote/test_fs_remote.py +++ b/tests/flytekit/unit/remote/test_fs_remote.py @@ -7,6 +7,7 @@ import fsspec import pytest from fsspec.implementations.http import HTTPFileSystem +from obstore.fsspec import register from flytekit.configuration import Config from flytekit.core.data_persistence import FileAccessProvider @@ -118,6 +119,8 @@ def test_remote_upload_with_data_persistence(sandbox_remote): @pytest.mark.parametrize("url_prefix", ["s3://my-s3-bucket", "abfs://my-azure-container", "abfss://my-azure-container", "gcs://my-gcs-bucket"]) def test_common_matching(url_prefix): + # ensure all url_prefix are registered + register(["s3", "abfs", "abfss", "gcs"]) urls = [ url_prefix + url_suffix for url_suffix in [ From aad1b5e84443fece1db3f663467456d50cf03597 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 18 
Mar 2025 22:09:28 +0800 Subject: [PATCH 29/42] fix: set async to true & update syntax Signed-off-by: machichima --- flytekit/core/data_persistence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 74322e8f20..aa54f62a48 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -93,7 +93,7 @@ def s3_setup_args(s3_cfg: configuration.S3Config, anonymous: bool = False, **kwa "retry_timeout": timedelta(minutes=3), } - client_options = {"timeout": "99999s", "allow_http": "true"} + client_options = {"timeout": "99999s", "allow_http": True} if config: kwargs["config"] = config @@ -750,7 +750,7 @@ async def async_put_data( put_data = loop_manager.synced(async_put_data) -register(["s3", "gs", "abfs"]) +register(["s3", "gs", "abfs"], asynchronous=True) flyte_tmp_dir = tempfile.mkdtemp(prefix="flyte-") From d79c61d979feaa766c275d70640a40a704e5e14f Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 18 Mar 2025 22:09:59 +0800 Subject: [PATCH 30/42] build: set obstore version and remove s3fs, gcsfs, adlfs Signed-off-by: machichima --- pyproject.toml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d471779c5a..de58f7388a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.9,<3.13" dependencies = [ # Please maintain an alphabetical order in the following list - "adlfs>=2023.3.0", "click>=6.6", "cloudpickle>=2.0.0", "croniter>=0.3.20", @@ -22,7 +21,6 @@ dependencies = [ "docstring-parser>=0.9.0", "flyteidl>=1.15.1", "fsspec>=2023.3.0", - "gcsfs>=2023.3.0", "googleapis-common-protos>=1.57", # Skipping those versions to account for the unwanted output coming from grpcio and grpcio-status. # Issue being tracked in https://github.com/flyteorg/flyte/issues/6082. 
@@ -38,7 +36,7 @@ dependencies = [ "marshmallow-jsonschema>=0.12.0", "mashumaro>=3.15", "msgpack>=1.1.0", - "obstore", + "obstore==0.5.1", "protobuf!=4.25.0", "pygments", "python-json-logger>=2.0.0", @@ -47,7 +45,6 @@ dependencies = [ "requests>=2.18.4", "rich", "rich_click", - "s3fs>=2023.3.0,!=2024.3.1", "statsd>=3.0.0", "typing_extensions", "urllib3>=1.22", From 2a67bac261e9c58a6b04c8210bda808f77e6559d Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 18 Mar 2025 22:24:34 +0800 Subject: [PATCH 31/42] fix: _ANON to _SKIP_SIGNATURE Signed-off-by: machichima --- flytekit/core/data_persistence.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index aa54f62a48..1bf8f60c7c 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -54,7 +54,7 @@ # for key and secret _FSSPEC_S3_KEY_ID = "access_key_id" _FSSPEC_S3_SECRET = "secret_access_key" -_ANON = "skip_signature" +_SKIP_SIGNATURE = "skip_signature" Uploadable = typing.Union[str, os.PathLike, pathlib.Path, bytes, io.BufferedReader, io.BytesIO, io.StringIO] @@ -81,7 +81,7 @@ def s3_setup_args(s3_cfg: configuration.S3Config, anonymous: bool = False, **kwa backoff = kwargs.pop("backoff", s3_cfg.backoff) if anonymous: - config[_ANON] = True + config[_SKIP_SIGNATURE] = True retry_config = { "max_retries": retries, @@ -135,7 +135,7 @@ def azure_setup_args( config["tenant_id"] = kwargs.get("tenant_id", azure_cfg.tenant_id) if anonymous: - config[_ANON] = True + config[_SKIP_SIGNATURE] = True client_options = {"timeout": "99999s", "allow_http": "true"} From 3244d82f20cd66bebe14d1033b8788dae29ef41a Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 18 Mar 2025 22:32:38 +0800 Subject: [PATCH 32/42] fix: remove redundant gs_setup_args function Signed-off-by: machichima --- flytekit/core/data_persistence.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py index 1bf8f60c7c..dfed91fbc2 100644 --- a/flytekit/core/data_persistence.py +++ b/flytekit/core/data_persistence.py @@ -103,15 +103,6 @@ def s3_setup_args(s3_cfg: configuration.S3Config, anonymous: bool = False, **kwa return kwargs -def gs_setup_args(gcs_cfg: configuration.GCSConfig, anonymous: bool = False) -> Dict[str, Any]: - """ - Setup gcs storage, bucket is needed to create obstore store object - """ - kwargs: Dict[str, Any] = {} - - return kwargs - - def azure_setup_args( azure_cfg: configuration.AzureBlobStorageConfig, anonymous: bool = False, @@ -162,7 +153,7 @@ def get_fsspec_storage_options( **kwargs, } if protocol == "gs": - return {**gs_setup_args(data_config.gcs, anonymous=anonymous), **kwargs} + return kwargs if protocol in ("abfs", "abfss"): return { **azure_setup_args(data_config.azure, anonymous=anonymous, **kwargs), @@ -285,9 +276,7 @@ def get_filesystem( s3kwargs.update(kwargs) return fsspec.filesystem(protocol, **s3kwargs) # type: ignore elif protocol == "gs": - gskwargs = gs_setup_args(self._data_config.gcs, anonymous=anonymous) - gskwargs.update(kwargs) - return fsspec.filesystem(protocol, **gskwargs) # type: ignore + return fsspec.filesystem(protocol, **kwargs) # type: ignore elif protocol == "abfs": azkwargs = azure_setup_args(self._data_config.azure, anonymous=anonymous, **kwargs) azkwargs.update(kwargs) From 83f4db57a73c242a2fdef0bfd2d101864a0473e0 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 18 Mar 2025 22:39:28 +0800 Subject: [PATCH 
33/42] feat: add abfss

Signed-off-by: machichima
---
 flytekit/core/data_persistence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py
index dfed91fbc2..1a5e687281 100644
--- a/flytekit/core/data_persistence.py
+++ b/flytekit/core/data_persistence.py
@@ -739,7 +739,7 @@ async def async_put_data(
 
 put_data = loop_manager.synced(async_put_data)
 
-register(["s3", "gs", "abfs"], asynchronous=True)
+register(["s3", "gs", "abfs", "abfss"], asynchronous=True)
 
 flyte_tmp_dir = tempfile.mkdtemp(prefix="flyte-")
 

From 6346fc3f2d287205a35e477c8ddf1ef8bb644f13 Mon Sep 17 00:00:00 2001
From: machichima
Date: Tue, 18 Mar 2025 22:48:25 +0800
Subject: [PATCH 34/42] fix: also add abfss in get_filesystem

Signed-off-by: machichima
---
 flytekit/core/data_persistence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flytekit/core/data_persistence.py b/flytekit/core/data_persistence.py
index 1a5e687281..59301fa169 100644
--- a/flytekit/core/data_persistence.py
+++ b/flytekit/core/data_persistence.py
@@ -277,7 +277,7 @@ def get_filesystem(
             return fsspec.filesystem(protocol, **s3kwargs)  # type: ignore
         elif protocol == "gs":
             return fsspec.filesystem(protocol, **kwargs)  # type: ignore
-        elif protocol == "abfs":
+        elif protocol in ("abfs", "abfss"):
             azkwargs = azure_setup_args(self._data_config.azure, anonymous=anonymous, **kwargs)
             azkwargs.update(kwargs)
             return fsspec.filesystem(protocol, **azkwargs)  # type: ignore

From dfbf8544b0054a580dfb3ee9d86da598f2bbfc79 Mon Sep 17 00:00:00 2001
From: machichima
Date: Tue, 18 Mar 2025 22:59:46 +0800
Subject: [PATCH 35/42] build: update dependencies

Signed-off-by: machichima
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index de58f7388a..f5ca524ae1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.9,<3.13"
 dependencies = [
     # Please maintain an alphabetical order in the following list
+    "aiohttp>=3.11.13",
     "click>=6.6",
     "cloudpickle>=2.0.0",
     "croniter>=0.3.20",

From 54c57ed468cb24c74fbb26b2c134c6260ed80ff8 Mon Sep 17 00:00:00 2001
From: machichima
Date: Wed, 19 Mar 2025 23:01:44 +0800
Subject: [PATCH 36/42] build: add botocore

Signed-off-by: machichima
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index f5ca524ae1..ddc9d8b684 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ requires-python = ">=3.9,<3.13"
 dependencies = [
     # Please maintain an alphabetical order in the following list
     "aiohttp>=3.11.13",
+    "botocore>=1.37.15",
     "click>=6.6",
     "cloudpickle>=2.0.0",
     "croniter>=0.3.20",

From 4beb285f44b309798268a7d21a4839748fd078aa Mon Sep 17 00:00:00 2001
From: machichima
Date: Thu, 20 Mar 2025 21:12:45 +0800
Subject: [PATCH 37/42] fix: mock listdir

Signed-off-by: machichima
---
 tests/flytekit/unit/core/test_flyte_directory.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/flytekit/unit/core/test_flyte_directory.py b/tests/flytekit/unit/core/test_flyte_directory.py
index 5b3b41115f..0ba355b8e4 100644
--- a/tests/flytekit/unit/core/test_flyte_directory.py
+++ b/tests/flytekit/unit/core/test_flyte_directory.py
@@ -319,7 +319,7 @@ def test_directory_guess():
     assert fft.extension() == ""
 
 
-@mock.patch("obstore.fsspec.FsspecStore._ls")
+@mock.patch("obstore.fsspec.FsspecStore.listdir")
@mock.patch("flytekit.core.data_persistence.FileAccessProvider.get_data") def test_list_dir(mock_get_data, mock_ls): remote_dir = "s3://test-flytedir" From 568aa90d837aa3845723303f8f3a19f180456cab Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 20 Mar 2025 23:52:51 +0800 Subject: [PATCH 38/42] build: add dependence s3fs in async-fsspec plugin Signed-off-by: machichima --- plugins/flytekit-async-fsspec/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/flytekit-async-fsspec/setup.py b/plugins/flytekit-async-fsspec/setup.py index 414658365a..13aca40f51 100644 --- a/plugins/flytekit-async-fsspec/setup.py +++ b/plugins/flytekit-async-fsspec/setup.py @@ -4,7 +4,7 @@ microlib_name = "flytekitplugins-async-fsspec" -plugin_requires = ["flytekit"] +plugin_requires = ["flytekit", "s3fs>=2023.3.0,!=2024.3.1"] __version__ = "0.0.0+develop" From c90a2ff747220c54cd3de554452eda3fe887fbce Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 8 Apr 2025 21:22:06 +0800 Subject: [PATCH 39/42] build: upgrade obstore to 0.6.0 Signed-off-by: machichima --- Dockerfile.dev | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.dev b/Dockerfile.dev index 5d2b622f01..ea51253710 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -40,7 +40,7 @@ RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_FLYTEKIT=$PSEUDO_VERSION \ -e /flytekit \ -e /flytekit/plugins/flytekit-deck-standard \ -e /flytekit/plugins/flytekit-flyteinteractive \ - obstore==0.3.0b9 \ + obstore==0.6.0 \ markdown \ pandas \ pillow \ diff --git a/pyproject.toml b/pyproject.toml index ddc9d8b684..37f5aaa6d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ dependencies = [ "marshmallow-jsonschema>=0.12.0", "mashumaro>=3.15", "msgpack>=1.1.0", - "obstore==0.5.1", + "obstore==0.6.0", "protobuf!=4.25.0", "pygments", "python-json-logger>=2.0.0", From 792a3ec53e71b92ad1bed99b9ac5158610168044 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 8 May 2025 09:20:30 +0800 Subject: [PATCH 40/42] fix: add missing deps -e Signed-off-by: machichima --- Dockerfile.dev | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Dockerfile.dev b/Dockerfile.dev index ea51253710..257e6de75b 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -42,6 +42,7 @@ RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_FLYTEKIT=$PSEUDO_VERSION \ -e /flytekit/plugins/flytekit-flyteinteractive \ obstore==0.6.0 \ markdown \ + decorator \ pandas \ pillow \ plotly \ diff --git a/pyproject.toml b/pyproject.toml index dc690cbad7..285f489112 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "msgpack>=1.1.0", "obstore==0.6.0", "protobuf!=4.25.0", + "pydantic>=2.11.4", "pygments", "python-json-logger>=2.0.0", "pytimeparse>=1.1.8", From ed5b62d4d04c43b542aa1b158e73a5a0faab5d6f Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 8 May 2025 09:21:11 +0800 Subject: [PATCH 41/42] test: printing detailed error info -e Signed-off-by: machichima --- tests/flytekit/integration/remote/test_remote.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/flytekit/integration/remote/test_remote.py b/tests/flytekit/integration/remote/test_remote.py index c5e908beaa..734e482ee2 100644 --- a/tests/flytekit/integration/remote/test_remote.py +++ b/tests/flytekit/integration/remote/test_remote.py @@ -132,7 +132,10 @@ def test_pydantic_default_input_with_map_task(): remote = FlyteRemote(Config.auto(config_file=CONFIG), PROJECT, DOMAIN) execution = 
remote.fetch_execution(name=execution_id)
     execution = remote.wait(execution=execution, timeout=datetime.timedelta(minutes=5))
-    print("Execution Error:", execution.error)
+    # Print detailed error info
+    if execution.error:
+        print("Execution Error Message:", execution.error.message)
+        print("Error URI (if logs are stored remotely):", execution.error.error_uri)
     assert execution.closure.phase == WorkflowExecutionPhase.SUCCEEDED, f"Execution failed with phase: {execution.closure.phase}"
 
 

From 564a1d1e3ac56f1380befcc8ae55bd7aff5a15b1 Mon Sep 17 00:00:00 2001
From: machichima
Date: Thu, 8 May 2025 20:34:09 +0800
Subject: [PATCH 42/42] build: remove pydantic deps

Signed-off-by: machichima
---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 285f489112..dc690cbad7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,6 @@ dependencies = [
     "msgpack>=1.1.0",
     "obstore==0.6.0",
     "protobuf!=4.25.0",
-    "pydantic>=2.11.4",
     "pygments",
     "python-json-logger>=2.0.0",
     "pytimeparse>=1.1.8",
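
A minimal usage sketch, not taken from the patches themselves: it shows how the obstore-backed fsspec filesystems registered by this series can be constructed directly, mirroring the keyword arguments that s3_setup_args assembles (an obstore config dict plus client_options and retry_config). The bucket name, MinIO-style endpoint, and credentials below are placeholders, not values from the series.

from datetime import timedelta

import fsspec
from obstore.fsspec import register

# Register obstore's fsspec implementation for the same protocols the series
# registers in data_persistence.py (there it passes asynchronous=True because
# flytekit drives the filesystem from its own event loop).
register(["s3", "gs", "abfs", "abfss"])

# Placeholder credentials/endpoint; in flytekit these come from S3Config and
# the environment via s3_setup_args().
fs = fsspec.filesystem(
    "s3",
    config={
        "access_key_id": "minio",
        "secret_access_key": "miniostorage",
        "endpoint_url": "http://localhost:30002",
    },
    client_options={"timeout": "99999s", "allow_http": True},
    retry_config={
        "max_retries": 3,
        "backoff": {
            "base": 2,
            "init_backoff": timedelta(seconds=5),
            "max_backoff": timedelta(seconds=16),
        },
        "retry_timeout": timedelta(minutes=3),
    },
)

# Paths are bucket-qualified, e.g. "my-bucket/path/to/file" (hypothetical bucket).
print(fs.ls("my-bucket"))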