48 commits
a663037
save bananza mode wip
aaronsteers Jul 31, 2025
68e1df6
feat: Complete bananza mode lake storage implementation
devin-ai-integration[bot] Aug 8, 2025
d5e6713
feat: Add warehouse sizing variables and scaling analysis
devin-ai-integration[bot] Aug 8, 2025
14457bc
fix: Resolve linting and type checking issues
devin-ai-integration[bot] Aug 8, 2025
54069f2
feat: Complete fast lake copy implementation with warehouse scaling
devin-ai-integration[bot] Aug 8, 2025
504eb39
feat: Add comprehensive timestamps and elapsed time to fast lake copy…
devin-ai-integration[bot] Aug 8, 2025
9a69509
feat: Scale faker data to 10 million rows and process only purchases …
devin-ai-integration[bot] Aug 8, 2025
8db85a6
fix: Add force_full_refresh=True to ensure all 10M records are processed
devin-ai-integration[bot] Aug 8, 2025
65d1628
feat: Add detailed performance metrics with records/s and MB/s for ea…
devin-ai-integration[bot] Aug 8, 2025
386f254
feat: Scale to 50 million records and add write_strategy=replace para…
devin-ai-integration[bot] Aug 8, 2025
4d70666
feat: Configure second run with 2XLARGE warehouse and skip initial da…
devin-ai-integration[bot] Aug 8, 2025
36f9fa8
fix: Add S3 eventual consistency delay and increase file descriptor l…
devin-ai-integration[bot] Aug 8, 2025
158572a
fix: Use COMPUTE_WH_LARGE instead of non-existent COMPUTE_WH_2XLARGE …
devin-ai-integration[bot] Aug 8, 2025
da22d77
feat: Update to COMPUTE_WH_2XLARGE warehouse and add Snowflake CPU mi…
devin-ai-integration[bot] Aug 8, 2025
57a3376
feat: Update warehouse configuration and add CPU minutes analysis
devin-ai-integration[bot] Aug 8, 2025
d1779f3
feat: Configure COMPUTE_WH_2XLARGE warehouse for 32x performance test
devin-ai-integration[bot] Aug 8, 2025
e0f4375
feat: Switch to co-located S3 bucket in US West 2 to match Snowflake …
devin-ai-integration[bot] Aug 8, 2025
9a9a23d
fix: Use existing accessible S3 bucket ab-destiantion-iceberg-us-west-2
devin-ai-integration[bot] Aug 8, 2025
a1c1c0c
feat: Add unload_table_to_lake() method for arbitrary table unloads
devin-ai-integration[bot] Aug 8, 2025
cbbf530
refactor: Make unload_stream_to_lake() call unload_table_to_lake() to…
devin-ai-integration[bot] Aug 8, 2025
de339fd
fix: Shorten parameter descriptions to fix line length linting issues
devin-ai-integration[bot] Aug 8, 2025
9fe4829
feat: Remove arrow-based write_dataset() and read_dataset() methods f…
devin-ai-integration[bot] Aug 8, 2025
16f5226
refactor: Make unload_stream_to_lake() generic in base class, move cu…
devin-ai-integration[bot] Aug 8, 2025
0f182f3
tidy up implementation
aaronsteers Aug 8, 2025
bc415da
add FastUnloadResultObject
aaronsteers Aug 8, 2025
5471786
add ability to load from an unload result
aaronsteers Aug 8, 2025
8dff561
fix: Resolve import errors and bugs preventing fast lake copy script …
devin-ai-integration[bot] Aug 8, 2025
c0bc120
toggle on reload, expand to 100MM sample records
aaronsteers Aug 12, 2025
30664de
fix: source config was not using new constant
aaronsteers Aug 12, 2025
34d4351
remove nonsense metric
aaronsteers Aug 12, 2025
297674d
feat: Toggle RELOAD_INITIAL_SOURCE_DATA to False after 100MM dataset …
devin-ai-integration[bot] Aug 12, 2025
f6cc1ea
feat: Implement robust COPY INTO metadata capture using RESULT_SCAN()
devin-ai-integration[bot] Aug 12, 2025
6b67ed1
feat: Enhance FastUnloadResult with actual record counts from RESULT_…
devin-ai-integration[bot] Aug 12, 2025
cf50d64
fix: Add noqa comment for necessary SQLAlchemy _mapping access
devin-ai-integration[bot] Aug 12, 2025
68c5fff
feat: Add files count and manifest display to fast lake copy example …
devin-ai-integration[bot] Aug 12, 2025
597c1a5
fix: Remove unnecessary f-string prefixes from static print statements
devin-ai-integration[bot] Aug 12, 2025
53bb076
Merge branch 'aj/feat/bananza-mode-replication' of https://github.com…
aaronsteers Aug 13, 2025
14b468c
feat: Add multi-warehouse performance analysis with timestamped S3 paths
devin-ai-integration[bot] Aug 13, 2025
103ecc7
Merge branch 'aj/feat/bananza-mode-replication' of https://git-manage…
devin-ai-integration[bot] Aug 13, 2025
6799539
feat: Remove cost efficiency and scaling efficiency tables from perfo…
devin-ai-integration[bot] Aug 13, 2025
d38423a
feat: Implement FastLoadResult class with Snowflake COPY INTO metadat…
devin-ai-integration[bot] Aug 13, 2025
47aa505
fix: Update table qualification logic in Snowflake fast_unload_table …
devin-ai-integration[bot] Aug 13, 2025
836d932
feat: Add FastLoadResult validation and test scripts
devin-ai-integration[bot] Aug 13, 2025
5d8ae55
feat: Add debug logging to compare unload vs load file processing
devin-ai-integration[bot] Aug 13, 2025
435e72d
feat: Enable debug logging with smaller dataset for load timeout debu…
devin-ai-integration[bot] Aug 13, 2025
3ea679a
fix: Prepare script for 100M dataset reload after accidental deletion
devin-ai-integration[bot] Aug 13, 2025
033c1aa
feat: Add destructive operation warning for RELOAD_INITIAL_SOURCE_DAT…
devin-ai-integration[bot] Aug 13, 2025
f53cf29
tidy (wip)
aaronsteers Aug 17, 2025
6 changes: 6 additions & 0 deletions airbyte/__init__.py
@@ -131,6 +131,7 @@
from airbyte.datasets import CachedDataset
from airbyte.destinations.base import Destination
from airbyte.destinations.util import get_destination
from airbyte.lakes import GCSLakeStorage, LakeStorage, S3LakeStorage
from airbyte.records import StreamRecord
from airbyte.results import ReadResult, WriteResult
from airbyte.secrets import SecretSourceEnum, get_secret
@@ -154,6 +155,7 @@
documents,
exceptions, # noqa: ICN001 # No 'exc' alias for top-level module
experimental,
lakes,
logs,
mcp,
records,
@@ -175,6 +177,7 @@
"documents",
"exceptions",
"experimental",
"lakes",
"logs",
"mcp",
"records",
@@ -195,7 +198,10 @@
"CachedDataset",
"Destination",
"DuckDBCache",
"GCSLakeStorage",
"LakeStorage",
"ReadResult",
"S3LakeStorage",
"SecretSourceEnum",
"Source",
"StreamRecord",
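With these exports in place, the new lake classes should be importable directly from the top-level package; a minimal sketch using only the names added in this diff:

import airbyte as ab
from airbyte import GCSLakeStorage, LakeStorage, S3LakeStorage

# The lakes submodule is also exposed, e.g. for types like FastUnloadResult.
from airbyte import lakes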
176 changes: 176 additions & 0 deletions airbyte/caches/base.py
@@ -20,6 +20,10 @@
from airbyte.caches._state_backend import SqlStateBackend
from airbyte.constants import DEFAULT_ARROW_MAX_CHUNK_SIZE, TEMP_FILE_CLEANUP
from airbyte.datasets._sql import CachedDataset


if TYPE_CHECKING:
from airbyte.lakes import FastUnloadResult, LakeStorage
from airbyte.shared.catalog_providers import CatalogProvider
from airbyte.shared.sql_processor import SqlConfig
from airbyte.shared.state_writers import StdOutStateWriter
@@ -74,6 +78,7 @@ def paired_destination_config(self) -> Any | dict[str, Any]: # noqa: ANN401 #
"configuration."
)

@final
def __init__(self, **data: Any) -> None: # noqa: ANN401
"""Initialize the cache and backends."""
super().__init__(**data)
@@ -107,6 +112,7 @@ def __init__(self, **data: Any) -> None: # noqa: ANN401
temp_file_cleanup=self.cleanup,
)

@final
@property
def config_hash(self) -> str | None:
"""Return a hash of the cache configuration.
@@ -115,6 +121,7 @@ def config_hash(self) -> str | None:
"""
return super(SqlConfig, self).config_hash

@final
def execute_sql(self, sql: str | list[str]) -> None:
"""Execute one or more SQL statements against the cache's SQL backend.

@@ -145,6 +152,7 @@ def processor(self) -> SqlProcessorBase:
"""Return the SQL processor instance."""
return self._read_processor

@final
def get_record_processor(
self,
source_name: str,
@@ -178,6 +186,7 @@ def get_record_processor(

# Read methods:

@final
def get_records(
self,
stream_name: str,
@@ -251,6 +260,7 @@ def __bool__(self) -> bool:
"""
return True

@final
def get_state_provider(
self,
source_name: str,
@@ -266,6 +276,7 @@ def get_state_provider(
destination_name=destination_name,
)

@final
def get_state_writer(
self,
source_name: str,
@@ -281,6 +292,7 @@ def get_state_writer(
destination_name=destination_name,
)

@final
def register_source(
self,
source_name: str,
@@ -294,6 +306,7 @@ def register_source(
incoming_stream_names=stream_names,
)

@final
def create_source_tables(
self,
source: Source,
@@ -330,20 +343,24 @@ def create_source_tables(
create_if_missing=True,
)

@final
def __getitem__(self, stream: str) -> CachedDataset:
"""Return a dataset by stream name."""
return self.streams[stream]

@final
def __contains__(self, stream: str) -> bool:
"""Return whether a stream is in the cache."""
return stream in (self._catalog_backend.stream_names)

@final
def __iter__( # type: ignore [override] # Overriding Pydantic model method
self,
) -> Iterator[tuple[str, Any]]:
"""Iterate over the streams in the cache."""
return ((name, dataset) for name, dataset in self.streams.items())

@final
def _write_airbyte_message_stream(
self,
stdin: IO[str] | AirbyteMessageIterator,
@@ -365,3 +382,162 @@ def _write_airbyte_message_stream(
progress_tracker=progress_tracker,
)
progress_tracker.log_cache_processing_complete()

@final
def fast_unload_streams(
self,
lake_store: LakeStorage,
*,
streams: list[str] | Literal["*"] | None = None,
) -> list[FastUnloadResult]:
"""Unload the cache to a lake store.

We dump data directly to parquet files in the lake store.

Args:
streams: The streams to unload. If None, unload all streams.
lake_store: The lake store to unload to. If None, use the default lake store.
"""
stream_names: list[str]
if streams == "*" or streams is None:
stream_names = self._catalog_backend.stream_names
elif isinstance(streams, list):
stream_names = streams

return [
self.fast_unload_stream(stream_name, lake_store)
for stream_name in stream_names
]

@final
def fast_unload_stream(
self,
stream_name: str,
lake_store: LakeStorage,
**kwargs,
) -> FastUnloadResult:
"""Unload a single stream to the lake store.

This generic implementation delegates to `fast_unload_table()`
which subclasses should override for database-specific fast operations.
"""
if not hasattr(self, "fast_unload_table"):
raise NotImplementedError("Subclasses must implement `fast_unload_table()` method")

sql_table = self.streams[stream_name].to_sql_table()
table_name = sql_table.name

🛠️ Refactor suggestion

hasattr check always passes – subclasses still allowed to omit an override

CacheBase itself defines fast_unload_table, so hasattr(self, "fast_unload_table") is always True.
Would you switch to comparing the underlying function object instead?

if self.fast_unload_table.__func__ is CacheBase.fast_unload_table:
    raise NotImplementedError("Subclass must override fast_unload_table()")

This guarantees the override is present – wdyt?

🤖 Prompt for AI Agents
In airbyte/caches/base.py around lines 428 to 433, the current hasattr check for
fast_unload_table always passes because CacheBase defines it, allowing
subclasses to omit overriding it. Replace the hasattr check with a comparison of
the underlying function objects by checking if self.fast_unload_table.__func__
is CacheBase.fast_unload_table, and raise NotImplementedError if they are the
same. This ensures subclasses must override the method.
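For illustration, a self-contained sketch of the override-detection pattern suggested above, independent of the PyAirbyte classes (names here are hypothetical):

class Base:
    def fast_unload_table(self) -> None:
        raise NotImplementedError

    def fast_unload_stream(self) -> None:
        # Compare the bound method's underlying function to the base-class
        # definition to detect a missing override before doing any work.
        if self.fast_unload_table.__func__ is Base.fast_unload_table:
            raise NotImplementedError("Subclass must override fast_unload_table()")
        self.fast_unload_table()


class WithOverride(Base):
    def fast_unload_table(self) -> None:
        print("unloading...")


class WithoutOverride(Base):
    pass


WithOverride().fast_unload_stream()     # prints "unloading..."
WithoutOverride().fast_unload_stream()  # raises NotImplementedError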

return self.fast_unload_table(
stream_name=stream_name,
table_name=table_name,
lake_store=lake_store,
lake_path_prefix=stream_name,
**kwargs,
)

def fast_unload_table(
self,
table_name: str,
lake_store: LakeStorage,
*,
stream_name: str | None = None,
db_name: str | None = None,
schema_name: str | None = None,
lake_path_prefix: str | None = None,
) -> FastUnloadResult:
"""Fast-unload a specific table to the designated lake storage.

Subclasses should override this method to implement fast unloads.
"""
raise NotImplementedError

@final
def fast_load_streams(
self,
lake_store: LakeStorage,
*,
streams: list[str],
) -> None:
"""Unload the cache to a lake store.

We dump data directly to parquet files in the lake store.

Args:
streams: The streams to unload. If None, unload all streams.
lake_store: The lake store to unload to. If None, use the default lake store.
"""
for stream_name in streams:
self.fast_load_stream(
stream_name,
lake_store,
lake_path_prefix=stream_name,
)

@final
def fast_load_stream(
self,
stream_name: str,
lake_store: LakeStorage,
lake_path_prefix: str,
*,
zero_copy: bool = False,
) -> None:
"""Load a single stream from the lake store using fast native LOAD operations."""
sql_table = self.streams[stream_name].to_sql_table()
table_name = sql_table.name

if zero_copy:
raise NotImplementedError("Zero-copy loading is not yet supported in Snowflake.")

self.fast_load_table(
table_name=table_name,
lake_store=lake_store,
lake_path_prefix=lake_path_prefix,
zero_copy=zero_copy,
)

def fast_load_table(
self,
table_name: str,
lake_store: LakeStorage,
lake_path_prefix: str,
*,
db_name: str | None = None,
schema_name: str | None = None,
) -> None:
"""Fast-unload a specific table to the designated lake storage.

Subclasses should override this method to implement fast unloads.
"""
raise NotImplementedError

@final
def fast_load_stream_from_unload_result(
self,
stream_name: str,
unload_result: FastUnloadResult,
*,
zero_copy: bool = False,
) -> None:
"""Load the result of a fast unload operation."""
self.fast_load_stream(
stream_name=stream_name,
lake_store=unload_result.lake_store,
lake_path_prefix=unload_result.lake_path_prefix,
zero_copy=zero_copy,
)

@final
def fast_load_table_from_unload_result(
self,
table_name: str,
unload_result: FastUnloadResult,
*,
zero_copy: bool = False,
) -> None:
"""Load the result of a fast unload operation."""
self.fast_load_table(
table_name=table_name,
lake_store=unload_result.lake_store,
lake_path_prefix=unload_result.lake_path_prefix,
zero_copy=zero_copy,
)
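
Taken together, a minimal round-trip sketch of the API added above (the cache and lake-store setup are assumptions for illustration; only the fast_unload_streams() and fast_load_stream_from_unload_result() calls mirror the methods defined in this diff):

import airbyte as ab

# Illustrative setup: in practice, use a cache whose backend overrides
# fast_unload_table()/fast_load_table() (Snowflake, per this PR's commits).
# The S3LakeStorage constructor arguments below are placeholders, not a
# confirmed signature.
cache = ab.get_default_cache()
lake = ab.S3LakeStorage(
    bucket_name="my-example-bucket",
    region="us-west-2",
)

# Unload every stream in the cache to Parquet files in the lake store.
unload_results = cache.fast_unload_streams(lake_store=lake, streams="*")

# Reload each stream from its unload result. The unload step sets
# lake_path_prefix to the stream name, so it doubles as the identifier here.
for result in unload_results:
    cache.fast_load_stream_from_unload_result(
        stream_name=result.lake_path_prefix,
        unload_result=result,
    )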