Commits (48)
a663037
save bananza mode wip
aaronsteers Jul 31, 2025
68e1df6
feat: Complete bananza mode lake storage implementation
devin-ai-integration[bot] Aug 8, 2025
d5e6713
feat: Add warehouse sizing variables and scaling analysis
devin-ai-integration[bot] Aug 8, 2025
14457bc
fix: Resolve linting and type checking issues
devin-ai-integration[bot] Aug 8, 2025
54069f2
feat: Complete fast lake copy implementation with warehouse scaling
devin-ai-integration[bot] Aug 8, 2025
504eb39
feat: Add comprehensive timestamps and elapsed time to fast lake copy…
devin-ai-integration[bot] Aug 8, 2025
9a69509
feat: Scale faker data to 10 million rows and process only purchases …
devin-ai-integration[bot] Aug 8, 2025
8db85a6
fix: Add force_full_refresh=True to ensure all 10M records are processed
devin-ai-integration[bot] Aug 8, 2025
65d1628
feat: Add detailed performance metrics with records/s and MB/s for ea…
devin-ai-integration[bot] Aug 8, 2025
386f254
feat: Scale to 50 million records and add write_strategy=replace para…
devin-ai-integration[bot] Aug 8, 2025
4d70666
feat: Configure second run with 2XLARGE warehouse and skip initial da…
devin-ai-integration[bot] Aug 8, 2025
36f9fa8
fix: Add S3 eventual consistency delay and increase file descriptor l…
devin-ai-integration[bot] Aug 8, 2025
158572a
fix: Use COMPUTE_WH_LARGE instead of non-existent COMPUTE_WH_2XLARGE …
devin-ai-integration[bot] Aug 8, 2025
da22d77
feat: Update to COMPUTE_WH_2XLARGE warehouse and add Snowflake CPU mi…
devin-ai-integration[bot] Aug 8, 2025
57a3376
feat: Update warehouse configuration and add CPU minutes analysis
devin-ai-integration[bot] Aug 8, 2025
d1779f3
feat: Configure COMPUTE_WH_2XLARGE warehouse for 32x performance test
devin-ai-integration[bot] Aug 8, 2025
e0f4375
feat: Switch to co-located S3 bucket in US West 2 to match Snowflake …
devin-ai-integration[bot] Aug 8, 2025
9a9a23d
fix: Use existing accessible S3 bucket ab-destiantion-iceberg-us-west-2
devin-ai-integration[bot] Aug 8, 2025
a1c1c0c
feat: Add unload_table_to_lake() method for arbitrary table unloads
devin-ai-integration[bot] Aug 8, 2025
cbbf530
refactor: Make unload_stream_to_lake() call unload_table_to_lake() to…
devin-ai-integration[bot] Aug 8, 2025
de339fd
fix: Shorten parameter descriptions to fix line length linting issues
devin-ai-integration[bot] Aug 8, 2025
9fe4829
feat: Remove arrow-based write_dataset() and read_dataset() methods f…
devin-ai-integration[bot] Aug 8, 2025
16f5226
refactor: Make unload_stream_to_lake() generic in base class, move cu…
devin-ai-integration[bot] Aug 8, 2025
0f182f3
tidy up implementation
aaronsteers Aug 8, 2025
bc415da
add FastUnloadResultObject
aaronsteers Aug 8, 2025
5471786
add ability to load from an unload result
aaronsteers Aug 8, 2025
8dff561
fix: Resolve import errors and bugs preventing fast lake copy script …
devin-ai-integration[bot] Aug 8, 2025
c0bc120
toggle on reload, expand to 100MM sample records
aaronsteers Aug 12, 2025
30664de
fix: source config was not using new constant
aaronsteers Aug 12, 2025
34d4351
remove nonsense metric
aaronsteers Aug 12, 2025
297674d
feat: Toggle RELOAD_INITIAL_SOURCE_DATA to False after 100MM dataset …
devin-ai-integration[bot] Aug 12, 2025
f6cc1ea
feat: Implement robust COPY INTO metadata capture using RESULT_SCAN()
devin-ai-integration[bot] Aug 12, 2025
6b67ed1
feat: Enhance FastUnloadResult with actual record counts from RESULT_…
devin-ai-integration[bot] Aug 12, 2025
cf50d64
fix: Add noqa comment for necessary SQLAlchemy _mapping access
devin-ai-integration[bot] Aug 12, 2025
68c5fff
feat: Add files count and manifest display to fast lake copy example …
devin-ai-integration[bot] Aug 12, 2025
597c1a5
fix: Remove unnecessary f-string prefixes from static print statements
devin-ai-integration[bot] Aug 12, 2025
53bb076
Merge branch 'aj/feat/bananza-mode-replication' of https://github.com…
aaronsteers Aug 13, 2025
14b468c
feat: Add multi-warehouse performance analysis with timestamped S3 paths
devin-ai-integration[bot] Aug 13, 2025
103ecc7
Merge branch 'aj/feat/bananza-mode-replication' of https://git-manage…
devin-ai-integration[bot] Aug 13, 2025
6799539
feat: Remove cost efficiency and scaling efficiency tables from perfo…
devin-ai-integration[bot] Aug 13, 2025
d38423a
feat: Implement FastLoadResult class with Snowflake COPY INTO metadat…
devin-ai-integration[bot] Aug 13, 2025
47aa505
fix: Update table qualification logic in Snowflake fast_unload_table …
devin-ai-integration[bot] Aug 13, 2025
836d932
feat: Add FastLoadResult validation and test scripts
devin-ai-integration[bot] Aug 13, 2025
5d8ae55
feat: Add debug logging to compare unload vs load file processing
devin-ai-integration[bot] Aug 13, 2025
435e72d
feat: Enable debug logging with smaller dataset for load timeout debu…
devin-ai-integration[bot] Aug 13, 2025
3ea679a
fix: Prepare script for 100M dataset reload after accidental deletion
devin-ai-integration[bot] Aug 13, 2025
033c1aa
feat: Add destructive operation warning for RELOAD_INITIAL_SOURCE_DAT…
devin-ai-integration[bot] Aug 13, 2025
f53cf29
tidy (wip)
aaronsteers Aug 17, 2025
70 changes: 70 additions & 0 deletions airbyte/caches/base.py
@@ -20,6 +20,7 @@
from airbyte.caches._state_backend import SqlStateBackend
from airbyte.constants import DEFAULT_ARROW_MAX_CHUNK_SIZE, TEMP_FILE_CLEANUP
from airbyte.datasets._sql import CachedDataset
from airbyte.lakes import LakeStorage
from airbyte.shared.catalog_providers import CatalogProvider
from airbyte.shared.sql_processor import SqlConfig
from airbyte.shared.state_writers import StdOutStateWriter
@@ -365,3 +366,72 @@ def _write_airbyte_message_stream(
progress_tracker=progress_tracker,
)
progress_tracker.log_cache_processing_complete()

def fast_unload(
self,
lake_store: LakeStorage,
*,
streams: list[str] | Literal["*"] | None = None,
) -> None:
"""Unload the cache to a lake store.

We dump data directly to parquet files in the lake store.

Args:
lake_store: The lake store to unload to.
streams: The streams to unload. If "*" or None (the default), unload all streams.
"""
stream_names: list[str]
if streams == "*" or streams is None:
stream_names = self._catalog_backend.stream_names
elif isinstance(streams, list):
stream_names = streams
else:
raise ValueError(f"Invalid 'streams' argument: {streams!r}")

for stream_name in stream_names:
self._unload_stream_to_lake_store(
stream_name,
lake_store,
)

def _unload_stream_to_lake_store(
self,
stream_name: str,
lake_store: LakeStorage,
) -> None:
"""Unload a single stream to the lake store.

This generic implementation delegates to the `lake_store` and passes
an Arrow dataset to the lake store object.

Subclasses can override this method to provide a faster
unload implementation.
"""
arrow_dataset = self.get_arrow_dataset(stream_name)
lake_store.write_dataset(
dataset=arrow_dataset,
table_name=stream_name,
schema=self.schema_name,
cache_dir=self.cache_dir,
cleanup=self.cleanup,
)

def _load_stream_from_lake_store(
self,
stream_name: str,
lake_store: LakeStorage,
) -> None:
"""Load a single stream from the lake store.

This generic implementation reads an Arrow dataset from the lake store
and writes it to the cache.

Subclasses can override this method to provide a faster
load implementation.
"""
arrow_dataset = lake_store.read_dataset(
table_name=stream_name,
schema=self.schema_name,
cache_dir=self.cache_dir,
cleanup=self.cleanup,
)
self.processor.write_arrow_dataset(arrow_dataset, stream_name)
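
For context, a minimal usage sketch of the new fast_unload() entry point (not part of this diff; the DuckDB cache, bucket name, and placeholder values are illustrative assumptions):

import airbyte as ab
from airbyte.caches import DuckDBCache
from airbyte.lakes import S3LakeStorage

# Illustrative cache and lake store; names and values are placeholders.
cache = DuckDBCache(db_path="./example.duckdb")
lake = S3LakeStorage(
    bucket_name="example-bucket",
    region="us-west-2",
    access_key_id=ab.get_secret("AWS_ACCESS_KEY_ID"),
    secret_access_key=ab.get_secret("AWS_SECRET_ACCESS_KEY"),
)

# Unload every stream in the cache ("*" and None are equivalent here):
cache.fast_unload(lake_store=lake, streams="*")

# Or unload an explicit subset of streams:
cache.fast_unload(lake_store=lake, streams=["products"])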
68 changes: 68 additions & 0 deletions airbyte/caches/snowflake.py
@@ -68,6 +68,8 @@
from airbyte.destinations._translate_cache_to_dest import (
snowflake_cache_to_destination_configuration,
)
from airbyte.lakes import LakeStorage
from airbyte.secrets.util import get_secret
from airbyte.shared.sql_processor import RecordDedupeMode, SqlProcessorBase


@@ -86,6 +88,72 @@ def paired_destination_config(self) -> DestinationSnowflake:
"""Return a dictionary of destination configuration values."""
return snowflake_cache_to_destination_configuration(cache=self)

def unload_stream_to_lake(
self,
stream_name: str,
lake_store: LakeStorage,
) -> None:
"""Unload a single stream to the lake store.

This generic implementation delegates to the `lake_store` and passes
an Arrow dataset to the lake store object.

Subclasses can override this method to provide a faster
unload implementation.
"""
sql_table = self.streams[stream_name].to_sql_table()
table_name = sql_table.name
aws_access_key_id = get_secret("AWS_ACCESS_KEY_ID")
aws_secret_access_key = get_secret("AWS_SECRET_ACCESS_KEY")
unload_statement = "\n".join([
f"COPY INTO '{lake_store.get_stream_root_uri(stream_name)}'",
f"FROM {table_name}",
"CREDENTIALS=(",
f" AWS_KEY_ID='{aws_access_key_id}'",
f" AWS_SECRET_KEY='{aws_secret_access_key}'",
")",
"FILE_FORMAT = (TYPE = 'PARQUET')",
"OVERWRITE = TRUE",
])
self.execute_sql(unload_statement)

# To get the per-file manifest data from the COPY INTO operation:
# self.query_sql("SELECT * FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()))")

def load_stream_from_lake(
self,
stream_name: str,
lake_store: LakeStorage,
*,
zero_copy: bool = False,
) -> None:
"""Load a single stream from the lake store.

This generic implementation delegates to the `lake_store` and passes
an Arrow dataset to the lake store object.

Subclasses can override this method to provide a faster
unload implementation.
"""
sql_table = self.streams[stream_name].to_sql_table()
table_name = sql_table.name
aws_access_key_id = get_secret("AWS_ACCESS_KEY_ID")
aws_secret_access_key = get_secret("AWS_SECRET_ACCESS_KEY")
if zero_copy:
# Zero-copy loading is not yet supported in Snowflake.
raise NotImplementedError("Zero-copy loading is not yet supported in Snowflake.")

load_statement = "\n".join([
f"COPY INTO {table_name}",
f"FROM '{lake_store.get_stream_root_uri(stream_name)}'",
"CREDENTIALS=(",
f" AWS_KEY_ID='{aws_access_key_id}'",
f" AWS_SECRET_KEY='{aws_secret_access_key}'",
")",
"FILE_FORMAT = (TYPE = 'PARQUET')",
])
self.execute_sql(load_statement)


# Expose the Cache class and also the Config class.
__all__ = [
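
For reference, a sketch of how the COPY INTO manifest could be captured right after the unload using RESULT_SCAN(), as hinted in the comment above. The unload_with_manifest helper is hypothetical, omits the CREDENTIALS clause for brevity, and assumes the follow-up query runs in the same session:

def unload_with_manifest(cache, stream_name: str, lake_store) -> list[dict]:
    """Hypothetical helper: run the unload, then read back the per-file manifest."""
    sql_table = cache.streams[stream_name].to_sql_table()
    unload_statement = "\n".join([
        f"COPY INTO '{lake_store.get_stream_root_uri(stream_name)}'",
        f"FROM {sql_table.name}",
        "FILE_FORMAT = (TYPE = 'PARQUET')",
        "OVERWRITE = TRUE",
    ])
    cache.execute_sql(unload_statement)
    # RESULT_SCAN is a table function, so it must be wrapped in TABLE().
    # This assumes LAST_QUERY_ID() resolves against the same session/connection.
    rows = cache.query_sql("SELECT * FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()))")
    # Each returned row describes one unloaded file (name, size, row count).
    return [dict(row._mapping) for row in rows]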
67 changes: 67 additions & 0 deletions airbyte/lakes.py
@@ -0,0 +1,67 @@
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
"""PyAirbyte LakeStorage class."""
from __future__ import annotations

import abc
from abc import abstractproperty

Copilot AI (Aug 7, 2025): The 'abstractproperty' decorator has been deprecated since Python 3.3. Use '@property' combined with '@abc.abstractmethod' instead for better compatibility and to follow current best practices.


class LakeStorage(abc.ABC):
"""PyAirbyte LakeStorage class."""

@abstractproperty

Copilot AI (Aug 7, 2025): Replace '@abstractproperty' with the '@property' and '@abc.abstractmethod' decorators. The correct pattern is '@property' stacked directly above '@abc.abstractmethod'.

def uri_protocol(self) -> str:
"""Return the URI protocol for the lake storage.
E.g. "file://", "s3://", "gcs://", etc.
"""
raise NotImplementedError("Subclasses must implement this method.")

@property
def root_storage_uri(self) -> str:
"""Get the root URI for the lake storage."""
return f"{self.uri_protocol}{self.root_storage_path}/"

@property
def root_storage_path(self) -> str:
"""Get the root path for the lake storage."""
return "airbyte/lake"

def path_to_uri(self, path: str) -> str:
"""Convert a relative lake path to a URI."""
return f"{self.root_storage_uri}{path}"

def get_stream_root_path(
self,
stream_name: str,
) -> str:
"""Get the path for a stream in the lake storage."""
return f"{self.root_storage_path}/{stream_name}/"

def get_stream_root_uri(
self,
stream_name: str,
) -> str:
"""Get the URI root for a stream in the lake storage."""
return self.path_to_uri(self.get_stream_root_path(stream_name))


class S3LakeStorage(LakeStorage):
"""S3 Lake Storage implementation."""

def __init__(self, bucket_name: str, region: str, access_key_id: str, secret_access_key: str):
"""Initialize S3LakeStorage with required parameters."""
self.bucket_name = bucket_name
self.region = region
self.access_key_id = access_key_id
self.secret_access_key = secret_access_key

@property
def uri_protocol(self) -> str:
"""Return the URI protocol for S3."""
return "s3://"

@property
def root_storage_uri(self) -> str:
"""Get the root URI for the S3 lake storage."""
return f"{self.uri_protocol}{self.bucket_name}/"
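
To make the review suggestion above concrete, a short sketch of the non-deprecated abstract-property pattern. LakeStorageSketch and FileLakeStorage are hypothetical names used only for illustration, not part of the diff:

import abc


class LakeStorageSketch(abc.ABC):
    """Sketch of an abstract URI property using @property + @abc.abstractmethod."""

    @property
    @abc.abstractmethod
    def uri_protocol(self) -> str:
        """Return the URI protocol, e.g. "s3://" or "file://"."""
        ...

    @property
    def root_storage_uri(self) -> str:
        """Get the root URI for the lake storage."""
        return f"{self.uri_protocol}airbyte/lake/"


class FileLakeStorage(LakeStorageSketch):
    """Hypothetical local-filesystem implementation, for illustration only."""

    @property
    def uri_protocol(self) -> str:
        return "file://"


print(FileLakeStorage().root_storage_uri)  # -> file://airbyte/lake/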
82 changes: 82 additions & 0 deletions examples/run_fast_lake_copy.py
@@ -0,0 +1,82 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
"""An example script to run a fast lake copy operation using PyAirbyte.
Usage:
poetry run python examples/run_fast_lake_copy.py
Required secrets:
- SNOWFLAKE_PASSWORD: Password for Snowflake connection.
- AWS_ACCESS_KEY_ID: AWS access key ID for S3 connection.
- AWS_SECRET_ACCESS_KEY: AWS secret access key for S3 connection.
"""
from numpy import source

Copilot AI (Aug 7, 2025): This import is incorrect and will cause a runtime error; 'source' is not exported by numpy. The line appears to be an accidental addition and should be removed.

Suggested change: delete `from numpy import source`.

import airbyte as ab
from airbyte.caches.snowflake import SnowflakeCache
from airbyte.lakes import S3LakeStorage
from airbyte.secrets.google_gsm import GoogleGSMSecretManager


AIRBYTE_INTERNAL_GCP_PROJECT = "dataline-integration-testing"
secret_mgr = GoogleGSMSecretManager(
project=AIRBYTE_INTERNAL_GCP_PROJECT,
credentials_json=ab.get_secret("GCP_GSM_CREDENTIALS"),
)

secret = secret_mgr.get_secret(
secret_name="AIRBYTE_LIB_SNOWFLAKE_CREDS",
)
assert secret is not None, "Secret not found."
secret_config = secret.parse_json()

source = ab.get_source(
"source-faker",
config={
"count": 1000,
"seed": 0,
"parallelism": 1,
"always_updated": False,
},
install_if_missing=True,
streams=["products"],
)

snowflake_cache_a = SnowflakeCache(
account=secret_config["account"],
username=secret_config["username"],
password=secret_config["password"],
database=secret_config["database"],
warehouse=secret_config["warehouse"],
role=secret_config["role"],
schema_name="test_fast_copy_source",
)
snowflake_cache_b = SnowflakeCache(
account=secret_config["account"],
username=secret_config["username"],
password=secret_config["password"],
database=secret_config["database"],
warehouse=secret_config["warehouse"],
role=secret_config["role"],
schema_name="test_fast_copy_dest",
)

s3_lake = S3LakeStorage(
bucket_name="mybucket",
region="us-west-2",
access_key_id=ab.get_secret("AWS_ACCESS_KEY_ID"),
secret_access_key=ab.get_secret("AWS_SECRET_ACCESS_KEY"),
)

# Begin processing
source.read(cache=snowflake_cache_a)

snowflake_cache_a.unload_stream_to_lake(
stream_name="products",
lake_store=s3_lake,
)

snowflake_cache_b.load_stream_from_lake(
stream_name="products",
lake_store=s3_lake,
zero_copy=False,  # Zero-copy loading is not yet supported for Snowflake (raises NotImplementedError if True).
)
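
A hedged sketch of the per-step timing instrumentation described in the commit history (elapsed time, throughput); this is not part of the diff and simply wraps the calls above:

import time

start = time.perf_counter()
snowflake_cache_a.unload_stream_to_lake(stream_name="products", lake_store=s3_lake)
unload_elapsed = time.perf_counter() - start

start = time.perf_counter()
snowflake_cache_b.load_stream_from_lake(stream_name="products", lake_store=s3_lake)
load_elapsed = time.perf_counter() - start

print(f"Unload elapsed: {unload_elapsed:.1f}s")
print(f"Load elapsed:   {load_elapsed:.1f}s")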