Skip to content

feat(caches): add create_source_tables method to CacheBase class (do not merge) #631

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion airbyte/caches/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from __future__ import annotations

from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, ClassVar, final
from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, final

import pandas as pd
import pyarrow as pa
Expand Down Expand Up @@ -34,6 +34,7 @@
from airbyte.shared.sql_processor import SqlProcessorBase
from airbyte.shared.state_providers import StateProviderBase
from airbyte.shared.state_writers import StateWriterBase
from airbyte.sources.base import Source
from airbyte.strategies import WriteStrategy


Expand Down Expand Up @@ -293,6 +294,35 @@ def register_source(
incoming_stream_names=stream_names,
)

def create_source_tables(
    self,
    source: Source,
    streams: Literal["*"] | list[str] | None = None,
) -> None:
    """Create tables in the cache for the provided source if they do not exist already.

    Tables are created based upon the Source's catalog.

    Args:
        source: The source to create tables for.
        streams: Stream names to create tables for. If None, use the Source's
            selected_streams or "*" if neither is set. If "*", all available
            streams will be used.
    """
    # Resolve the stream selection: explicit list > source's selection > all ("*").
    selected = streams
    if selected is None:
        selected = source.get_selected_streams() or "*"

    provider = CatalogProvider(source.get_configured_catalog(streams=selected))

    # Make sure the target schema exists before any table DDL runs.
    self.processor._ensure_schema_exists()  # noqa: SLF001 # Accessing non-public member

    # Create each stream's final table if it is not already present.
    for name in provider.stream_names:
        self.processor._ensure_final_table_exists(  # noqa: SLF001
            stream_name=name,
            create_if_missing=True,
        )

def __getitem__(self, stream: str) -> CachedDataset:
    """Look up and return the cached dataset registered under *stream*."""
    datasets = self.streams
    return datasets[stream]
Expand Down
75 changes: 75 additions & 0 deletions tests/unit_tests/test_caches.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from pathlib import Path


from airbyte.caches.base import CacheBase
from airbyte.caches.duckdb import DuckDBCache

Expand Down Expand Up @@ -60,3 +61,77 @@ def test_duck_db_cache_config_get_database_name_with_default_schema_name():

def test_duck_db_cache_config_inheritance_from_sql_cache_config_base():
    """DuckDBCache must derive from the generic CacheBase interface."""
    assert issubclass(DuckDBCache, CacheBase), "DuckDBCache must subclass CacheBase"


def test_create_source_tables(mocker):
    """Test that the create_source_tables method correctly creates tables based on the source's catalog."""
    # Import here to avoid circular imports
    from airbyte_protocol.models import (
        ConfiguredAirbyteCatalog,
        ConfiguredAirbyteStream,
    )

    # Create a proper ConfiguredAirbyteCatalog for mocking
    stream1 = ConfiguredAirbyteStream(
        stream={
            "name": "stream1",
            "json_schema": {},
            "supported_sync_modes": ["full_refresh"],
        },
        sync_mode="full_refresh",
        destination_sync_mode="overwrite",
    )
    stream2 = ConfiguredAirbyteStream(
        stream={
            "name": "stream2",
            "json_schema": {},
            "supported_sync_modes": ["full_refresh"],
        },
        sync_mode="full_refresh",
        destination_sync_mode="overwrite",
    )
    catalog = ConfiguredAirbyteCatalog(streams=[stream1, stream2])

    # Mock the catalog provider so stream_names is deterministic.
    # NOTE(review): this patches the name in its defining module. If
    # airbyte.caches.base imports CatalogProvider directly (``from ... import
    # CatalogProvider``), the patch is a no-op and the real provider is used —
    # the test still passes only because ``catalog`` above lists the same two
    # streams. Consider patching "airbyte.caches.base.CatalogProvider" instead;
    # verify against base.py's imports.
    mock_catalog_provider = mocker.Mock()
    mock_catalog_provider.stream_names = ["stream1", "stream2"]
    mocker.patch(
        "airbyte.shared.catalog_providers.CatalogProvider",
        return_value=mock_catalog_provider,
    )

    # Mock a source with configured catalog and selected streams
    mock_source = mocker.Mock()
    mock_source.get_configured_catalog.return_value = catalog
    mock_source.get_selected_streams.return_value = ["stream1"]

    # Create a DuckDBCache instance with mocked processor
    cache = DuckDBCache(db_path=UNIT_TEST_DB_PATH)

    # Mock the processor property. Passing the PropertyMock positionally (as
    # ``new``) works because PropertyMock is a descriptor; the conventional
    # spelling is ``new_callable=mocker.PropertyMock`` — either is acceptable.
    mock_processor = mocker.Mock()
    mocker.patch.object(
        DuckDBCache, "processor", mocker.PropertyMock(return_value=mock_processor)
    )

    # Test with default (None) stream parameter - should use source's selected streams
    cache.create_source_tables(mock_source)

    # Verify the correct methods were called:
    # selection fell back to the source's selected streams ("stream1"),
    # the schema was ensured once, and one table was ensured per catalog stream.
    mock_source.get_selected_streams.assert_called_once()
    mock_source.get_configured_catalog.assert_called_once_with(streams=["stream1"])
    mock_processor._ensure_schema_exists.assert_called_once()
    assert mock_processor._ensure_final_table_exists.call_count == 2

    # Reset mocks
    mock_source.reset_mock()
    mock_processor.reset_mock()

    # Test with explicit stream list
    cache.create_source_tables(mock_source, streams=["stream2"])

    # Verify the correct methods were called: an explicit list bypasses the
    # source's selection entirely and is passed straight to the catalog.
    mock_source.get_selected_streams.assert_not_called()
    mock_source.get_configured_catalog.assert_called_once_with(streams=["stream2"])
    mock_processor._ensure_schema_exists.assert_called_once()
    assert mock_processor._ensure_final_table_exists.call_count == 2
Loading