do not merge: bananza mode wip #744
base: main
Changes from 4 commits
@@ -59,7 +59,7 @@
 from __future__ import annotations
 
-from typing import ClassVar
+from typing import TYPE_CHECKING, ClassVar
 
 from airbyte_api.models import DestinationSnowflake
@@ -68,6 +68,11 @@
 from airbyte.destinations._translate_cache_to_dest import (
     snowflake_cache_to_destination_configuration,
 )
+
+if TYPE_CHECKING:
+    from airbyte.lakes import LakeStorage
+
+from airbyte.secrets.util import get_secret
 from airbyte.shared.sql_processor import RecordDedupeMode, SqlProcessorBase
@@ -86,6 +91,92 @@ def paired_destination_config(self) -> DestinationSnowflake:
         """Return a dictionary of destination configuration values."""
         return snowflake_cache_to_destination_configuration(cache=self)
 
+    def unload_stream_to_lake(
+        self,
+        stream_name: str,
+        lake_store: LakeStorage,
+    ) -> None:
+        """Unload a single stream to the lake store using Snowflake COPY INTO.
+
+        This implementation uses Snowflake's COPY INTO command to unload data
+        directly to S3 in Parquet format, with managed artifacts, for optimal
+        performance.
+
+        Args:
+            stream_name: The name of the stream to unload.
+            lake_store: The lake store to unload to.
+        """
+        sql_table = self.streams[stream_name].to_sql_table()
+        table_name = sql_table.name
+        aws_access_key_id = get_secret("AWS_ACCESS_KEY_ID")
+        aws_secret_access_key = get_secret("AWS_SECRET_ACCESS_KEY")
+
+        artifact_prefix = lake_store.get_artifact_prefix()
+        file_format_name = f"{artifact_prefix}PARQUET_FORMAT"
+        create_format_sql = f"""
+            CREATE FILE FORMAT IF NOT EXISTS {file_format_name}
+            TYPE = PARQUET
+            COMPRESSION = SNAPPY
+        """
+        self.execute_sql(create_format_sql)
+
+        stage_name = f"{artifact_prefix}STAGE"
+        create_stage_sql = f"""
+            CREATE STAGE IF NOT EXISTS {stage_name}
+            URL = '{lake_store.root_storage_uri}'
+            CREDENTIALS = (
+                AWS_KEY_ID = '{aws_access_key_id}'
+                AWS_SECRET_KEY = '{aws_secret_access_key}'
+            )
+            FILE_FORMAT = (FORMAT_NAME = '{file_format_name}')
+        """
+        self.execute_sql(create_stage_sql)
+
+        unload_statement = f"""
+            COPY INTO @{stage_name}/{stream_name}/
+            FROM {self._read_processor.sql_config.schema_name}.{table_name}
+            FILE_FORMAT = (FORMAT_NAME = '{file_format_name}')
+            OVERWRITE = TRUE
+        """
+        self.execute_sql(unload_statement)
+
+    def load_stream_from_lake(
+        self,
+        stream_name: str,
+        lake_store: LakeStorage,
+        *,
+        zero_copy: bool = False,
+    ) -> None:
+        """Load a single stream from the lake store using Snowflake COPY INTO.
+
+        This implementation uses Snowflake's COPY INTO command to load data
+        directly from S3 in Parquet format, with managed artifacts, for optimal
+        performance.
+
+        Args:
+            stream_name: The name of the stream to load.
+            lake_store: The lake store to load from.
+            zero_copy: Whether to use zero-copy loading. If True, the data will be
+                loaded without copying it into the cache. This is useful for large
+                datasets that don't need to be stored in the cache.
+        """
+        sql_table = self.streams[stream_name].to_sql_table()
+        table_name = sql_table.name
+
+        if zero_copy:
+            raise NotImplementedError("Zero-copy loading is not yet supported in Snowflake.")
+
+        artifact_prefix = lake_store.get_artifact_prefix()
+        file_format_name = f"{artifact_prefix}PARQUET_FORMAT"
+        stage_name = f"{artifact_prefix}STAGE"
+
+        load_statement = f"""
+            COPY INTO {self._read_processor.sql_config.schema_name}.{table_name}
+            FROM @{stage_name}/{stream_name}/
+            FILE_FORMAT = (FORMAT_NAME = '{file_format_name}')
+            MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
+            PURGE = FALSE
+        """
+        self.execute_sql(load_statement)
+
+
 # Expose the Cache class and also the Config class.
 __all__ = [
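For reviewers who want the generated SQL in one place, the sketch below spells out approximately what one unload/load round trip renders to for a stream named users. Every identifier here is an assumption for illustration only: the artifact prefix is taken to resolve to AIRBYTE_LAKE_, the cache schema to airbyte_raw, and the bucket URI is invented.

# Approximate SQL produced by unload_stream_to_lake("users", lake_store) and
# load_stream_from_lake("users", lake_store). All names below are illustrative
# assumptions, not values taken from this PR.
UNLOAD_SEQUENCE = [
    # One-time managed artifact: a named Parquet file format.
    """
    CREATE FILE FORMAT IF NOT EXISTS AIRBYTE_LAKE_PARQUET_FORMAT
        TYPE = PARQUET
        COMPRESSION = SNAPPY
    """,
    # One-time managed artifact: an external stage rooted at the lake URI.
    """
    CREATE STAGE IF NOT EXISTS AIRBYTE_LAKE_STAGE
        URL = 's3://my-bucket/lake/'
        CREDENTIALS = (AWS_KEY_ID = '...' AWS_SECRET_KEY = '...')
        FILE_FORMAT = (FORMAT_NAME = 'AIRBYTE_LAKE_PARQUET_FORMAT')
    """,
    # Per-stream unload into a dedicated prefix on the stage.
    """
    COPY INTO @AIRBYTE_LAKE_STAGE/users/
        FROM airbyte_raw.users
        FILE_FORMAT = (FORMAT_NAME = 'AIRBYTE_LAKE_PARQUET_FORMAT')
        OVERWRITE = TRUE
    """,
]

# Per-stream load back into the cache table. PURGE = FALSE keeps the Parquet
# files in the lake after loading; MATCH_BY_COLUMN_NAME maps Parquet columns
# to table columns case-insensitively.
LOAD_STATEMENT = """
    COPY INTO airbyte_raw.users
        FROM @AIRBYTE_LAKE_STAGE/users/
        FILE_FORMAT = (FORMAT_NAME = 'AIRBYTE_LAKE_PARQUET_FORMAT')
        MATCH_BY_COLUMN_NAME = CASE_INSENSITIVE
        PURGE = FALSE
"""

One design caveat worth flagging: CREATE STAGE as written embeds long-lived AWS keys in the statement text; Snowflake storage integrations are the usual way to grant stage access without putting keys in SQL.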
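And a hypothetical end-to-end usage sketch. The SnowflakeCache fields below match PyAirbyte's existing cache config; LakeStorage is introduced by this PR, and its constructor signature and the S3 location are assumptions here.

# Hypothetical round trip through the new lake API. LakeStorage's constructor
# is assumed; the rest uses PyAirbyte's existing public API.
import airbyte as ab
from airbyte.caches import SnowflakeCache
from airbyte.lakes import LakeStorage  # new module in this PR

cache = SnowflakeCache(
    account="my-account",          # placeholder Snowflake connection values
    username="my-user",
    password=ab.get_secret("SNOWFLAKE_PASSWORD"),
    database="my-db",
    warehouse="my-warehouse",
    role="my-role",
)

# Sync a small sample stream into the cache first.
source = ab.get_source("source-faker", config={"count": 1_000}, streams=["users"])
source.read(cache=cache)

lake_store = LakeStorage("s3://my-bucket/lake/")  # assumed signature

# Export the cached stream to Parquet files in the lake...
cache.unload_stream_to_lake("users", lake_store=lake_store)

# ...and load them back into the cache table via COPY INTO.
cache.load_stream_from_lake("users", lake_store=lake_store)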