From 956ab46c37563f13c78ff3d067556430a2ce2c70 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 01:42:07 +0100 Subject: [PATCH 01/25] Add lazy read to constructor --- .../parsers/model_to_component_factory.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3eec82a45..b2b631058 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2704,6 +2704,27 @@ def create_simple_retriever( model.ignore_stream_slicer_parameters_on_paginated_requests or False ) + if model.lazy_read_pointer and not bool(self._connector_state_manager.get_stream_state(name, None)): + lazy_read_pointer = [InterpolatedString.create(path, parameters=model.parameters or {}) for path in model.lazy_read_pointer] + partition_router = self._create_component_from_model(model=model.partition_router, config=config) + stream_slicer = self._create_component_from_model(model=incremental_sync, config=config) if incremental_sync else SinglePartitionRouter(parameters={}) + + return LazySimpleRetriever( + name=name, + paginator=paginator, + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + stream_slicer=stream_slicer, + request_option_provider=request_options_provider, + cursor=cursor, + config=config, + ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, + parameters=model.parameters or {}, + partition_router=partition_router, + lazy_read_pointer=lazy_read_pointer, + ) + if self._limit_slices_fetched or self._emit_connector_builder_messages: return SimpleRetrieverTestReadDecorator( name=name, From 3723a3fa9eaa012822cbf81534067d9d1a3ed817 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 01:50:49 +0100 Subject: [PATCH 02/25] Add implementation --- .../parsers/model_to_component_factory.py | 3 + .../retrievers/simple_retriever.py | 111 ++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index b2b631058..987f3b7e5 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -435,6 +435,7 @@ ) from airbyte_cdk.sources.declarative.retrievers import ( AsyncRetriever, + LazySimpleRetriever, SimpleRetriever, SimpleRetrieverTestReadDecorator, ) @@ -2647,6 +2648,8 @@ def create_simple_retriever( stop_condition_on_cursor: bool = False, client_side_incremental_sync: Optional[Dict[str, Any]] = None, transformations: List[RecordTransformation], + incremental_sync: Optional[Union[IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel]] = None, + **kwargs: Any, ) -> SimpleRetriever: decoder = ( self._create_component_from_model(model=model.decoder, config=config) diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index df41e14a7..d2265b134 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -8,6 +8,7 @@ from itertools import islice from typing import Any, Callable, Iterable, List, Mapping, 
Optional, Set, Tuple, Union +import dpath import requests from airbyte_cdk.models import AirbyteMessage @@ -18,6 +19,7 @@ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import ( SinglePartitionRouter, ) +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator from airbyte_cdk.sources.declarative.requesters.request_options import ( @@ -32,6 +34,7 @@ from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState from airbyte_cdk.utils.mapping_helpers import combine_mappings + FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete" @@ -618,3 +621,111 @@ def _fetch_next_page( self.name, ), ) + + +class SafeResponse(requests.Response): + def __getattr__(self, name): + return getattr(requests.Response, name, None) + + @property + def content(self): + return super().content + + @content.setter + def content(self, value): + self._content = value.encode() if isinstance(value, str) else value + + +@dataclass +class LazySimpleRetriever(SimpleRetriever): + """ + A retriever that supports lazy loading from parent streams. + """ + partition_router: SubstreamPartitionRouter = field(init=True, repr=False, default=None) + lazy_read_pointer: Optional[List[InterpolatedString]] = None + + def _read_pages( + self, + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], + stream_state: Mapping[str, Any], + stream_slice: StreamSlice, + ) -> Iterable[Record]: + parent_stream_config = self.partition_router.parent_stream_configs[-1] + parent_stream = parent_stream_config.stream + + for parent_record in parent_stream.read_only_records(): + parent_record, parent_partition = self.partition_router.process_parent_record(parent_record, parent_stream.name) + if parent_record is None: + continue + + childs = self._extract_child_records(parent_record) + response = self._create_response(childs) + + yield from self._yield_records_with_pagination( + response, records_generator_fn, stream_state, stream_slice, parent_record, parent_stream_config + ) + + yield from [] + + def _extract_child_records(self, parent_record: Mapping) -> Mapping: + """Extract child records from a parent record based on lazy pointers.""" + if not self.lazy_read_pointer: + return parent_record + + path = [path.eval(self.config) for path in self.lazy_read_pointer] + return dpath.values(parent_record, path) if "*" in path else dpath.get(parent_record, path, default=[]) + + def _create_response(self, data: Mapping) -> SafeResponse: + """Create a SafeResponse with the given data.""" + response = SafeResponse() + response.content = json.dumps(data).encode("utf-8") + response.status_code = 200 + return response + + def _yield_records_with_pagination( + self, + response: requests.Response, + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], + stream_state: Mapping[str, Any], + stream_slice: StreamSlice, + parent_record: Record, + parent_stream_config: Any, + ) -> Iterable[Record]: + """Yield records, handling pagination if needed.""" + last_page_size, last_record = 0, None + + for record in records_generator_fn(response): + last_page_size += 1 + last_record = record + yield record + + next_page_token = self._next_page_token(response, last_page_size, last_record, None) + if next_page_token: + yield from 
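# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): the lazy-read happy path with a
# hypothetical parent payload. _extract_child_records pulls the embedded
# children out of the parent record, and _create_response wraps them in a
# SafeResponse so the normal record-selection pipeline runs unchanged:
#
#   parent = {"id": 42, "children": [{"id": "a"}, {"id": "b"}]}
#   # lazy_read_pointer ["children"] evaluates to path ["children"]; no "*",
#   # so dpath.get is used rather than dpath.values
#   childs = dpath.get(parent, ["children"], default=[])
#   response = SafeResponse()
#   response.content = json.dumps(childs).encode("utf-8")
#   response.status_code = 200
#
# records_generator_fn(response) then yields the embedded records as if they
# came from a real HTTP page; only when the paginator extracts a
# next_page_token from this synthetic response does the retriever fall back
# to real requests via _paginate below.
# ---------------------------------------------------------------------------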
self._paginate(next_page_token, records_generator_fn, stream_state, stream_slice, parent_record, parent_stream_config) + + def _paginate( + self, + next_page_token: Any, + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], + stream_state: Mapping[str, Any], + stream_slice: StreamSlice, + parent_record: Record, + parent_stream_config: Any, + ) -> Iterable[Record]: + """Handle pagination by fetching subsequent pages.""" + partition_field = parent_stream_config.partition_field.eval(self.config) + partition_value = dpath.get(parent_record, parent_stream_config.parent_key.eval(self.config)) + stream_slice = StreamSlice(partition={partition_field: partition_value, "parent_slice": {}}, cursor_slice=stream_slice.cursor_slice) + + while next_page_token: + response = self._fetch_next_page(stream_state, stream_slice, next_page_token) + last_page_size, last_record = 0, None + + for record in records_generator_fn(response): + last_page_size += 1 + last_record = record + yield record + + last_page_token_value = next_page_token.get("next_page_token") if next_page_token else None + next_page_token = self._next_page_token(response, last_page_size, last_record, last_page_token_value) + From 041632db1caa99fcd3c80dc26208b456fca49787 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 01:53:14 +0100 Subject: [PATCH 03/25] Update partition router with process_parent_record --- .../substream_partition_router.py | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 73a747f02..d5869410d 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -1,10 +1,12 @@ # # Copyright (c) 2023 Airbyte, Inc., all rights reserved. # + + import copy import logging from dataclasses import InitVar, dataclass -from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union +from typing import TYPE_CHECKING, Any, Iterable, Tuple, List, Mapping, MutableMapping, Optional, Union import dpath @@ -131,6 +133,40 @@ def _get_request_option( parent_config.request_option.inject_into_request(params, value, self.config) return params + def process_parent_record(self, parent_record: Union[AirbyteMessage, Record, Mapping], parent_stream_name: str) -> Tuple[Optional[Mapping], Optional[Mapping]]: + """ + Processes and extracts data from a parent record, handling different record types + and ensuring only valid types proceed. + + :param parent_record: The parent record to process. + :param parent_stream_name: The parent stream name associated with the record. + :return: Extracted record data and partition (if applicable). + :raises AirbyteTracedException: If the record type is invalid. + """ + if isinstance(parent_record, AirbyteMessage): + self.logger.warning( + f"Parent stream {parent_stream_name} returns records of type AirbyteMessage. " + f"This SubstreamPartitionRouter is not able to checkpoint incremental parent state." 
+ ) + if parent_record.type == MessageType.RECORD: + return parent_record.record.data, {} + return None, None # Skip invalid or non-record data + + # Handle Record type + if isinstance(parent_record, Record): + parent_partition = ( + parent_record.associated_slice.partition if parent_record.associated_slice else {} + ) + return parent_record.data, parent_partition + + # Validate the record type + if not isinstance(parent_record, Mapping): + raise AirbyteTracedException( + message=f"Parent stream returned records as invalid type {type(parent_record)}" + ) + + return parent_record, {} + def stream_slices(self) -> Iterable[StreamSlice]: """ Iterate over each parent stream's record and create a StreamSlice for each record. @@ -163,28 +199,13 @@ def stream_slices(self) -> Iterable[StreamSlice]: # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does # not support either substreams or RFR, but something that needs to be considered once we do for parent_record in parent_stream.read_only_records(): - parent_partition = None - # Skip non-records (eg AirbyteLogMessage) - if isinstance(parent_record, AirbyteMessage): - self.logger.warning( - f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state." - ) - if parent_record.type == MessageType.RECORD: - parent_record = parent_record.record.data # type: ignore[union-attr, assignment] # record is always a Record - else: - continue - elif isinstance(parent_record, Record): - parent_partition = ( - parent_record.associated_slice.partition - if parent_record.associated_slice - else {} - ) - parent_record = parent_record.data - elif not isinstance(parent_record, Mapping): - # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. 
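# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): the process_parent_record
# contract, shown with hypothetical inputs. All three accepted shapes
# normalize to a (record_data, partition) tuple:
#
#   router.process_parent_record({"id": 1}, "parents")
#   # -> ({"id": 1}, {})            plain Mapping: data as-is, empty partition
#
#   rec = Record(data={"id": 2}, stream_name="parents",
#                associated_slice=StreamSlice(partition={"p": "a"}, cursor_slice={}))
#   router.process_parent_record(rec, "parents")
#   # -> ({"id": 2}, {"p": "a"})    Record: partition taken from its slice
#
#   router.process_parent_record(airbyte_record_message, "parents")
#   # -> (record.data, {})          RECORD message: data only, plus a warning
#   # -> (None, None)               any other AirbyteMessage: caller skips it
#
# (airbyte_record_message is a placeholder name; any other input type raises
# AirbyteTracedException.)
# ---------------------------------------------------------------------------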
Anything else is invalid - raise AirbyteTracedException( - message=f"Parent stream returned records as invalid type {type(parent_record)}" - ) + # Process the parent record + parent_record, parent_partition = self.process_parent_record(parent_record, parent_stream.name) + + # Skip invalid or non-record data + if parent_record is None: + continue + try: partition_value = dpath.get( parent_record, # type: ignore [arg-type] From 604dbe82de6b2f6e932b2787e45026bc385b601a Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 01:57:18 +0100 Subject: [PATCH 04/25] Re-generate models --- .../models/declarative_component_schema.py | 80 ++++++++++++------- .../retrievers/simple_retriever.py | 1 - 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index f85ae7993..d66e322e0 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -609,7 +609,9 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], + examples=[ + ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] + ], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -1078,24 +1080,28 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( - None, - description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", + ) ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( + Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The 
variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", + ) ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -1113,7 +1119,9 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], + examples=[ + {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} + ], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1766,7 +1774,9 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( + schema_normalization: Optional[ + Union[SchemaNormalization, CustomSchemaNormalization] + ] = Field( SchemaNormalization.None_, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -1987,7 +1997,9 @@ class Config: description="Component used to fetch data incrementally based on a time field in the data.", title="Incremental Sync", ) - name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") + name: Optional[str] = Field( + "", description="The stream name.", example=["Users"], title="Name" + ) primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) @@ -2261,7 +2273,11 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -2305,7 +2321,9 @@ class AsyncRetriever(BaseModel): ) download_extractor: Optional[ Union[CustomRecordExtractor, DpathExtractor, ResponseToFileExtractor] - ] = Field(None, description="Responsible for fetching the records from provided urls.") + ] = Field( + None, description="Responsible for fetching the records from provided urls." 
+ ) creation_requester: Union[CustomRequester, HttpRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", @@ -2339,7 +2357,11 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -2407,10 +2429,12 @@ class DynamicDeclarativeStream(BaseModel): stream_template: DeclarativeStream = Field( ..., description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = Field( - ..., - description="Component resolve and populates stream templates with components values.", - title="Components Resolver", + components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = ( + Field( + ..., + description="Component resolve and populates stream templates with components values.", + title="Components Resolver", + ) ) diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index d2265b134..687666528 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -34,7 +34,6 @@ from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState from airbyte_cdk.utils.mapping_helpers import combine_mappings - FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete" From dae0b98a58cca9e831032960abc2e81f2ad962d7 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 13 Mar 2025 00:58:51 +0000 Subject: [PATCH 05/25] Auto-fix lint and format issues --- .../models/declarative_component_schema.py | 80 +++++++------------ .../parsers/model_to_component_factory.py | 25 ++++-- .../substream_partition_router.py | 22 ++++- .../retrievers/simple_retriever.py | 49 +++++++++--- 4 files changed, 105 insertions(+), 71 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index d66e322e0..f85ae7993 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -609,9 +609,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -1080,28 +1078,24 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( - Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + 
codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", - ) + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -1119,9 +1113,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1774,9 +1766,7 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[ - Union[SchemaNormalization, CustomSchemaNormalization] - ] = Field( + schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( SchemaNormalization.None_, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -1997,9 +1987,7 @@ class Config: description="Component used to fetch data incrementally based on a time field in the data.", title="Incremental Sync", ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" - ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) @@ -2273,11 +2261,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -2321,9 +2305,7 @@ class AsyncRetriever(BaseModel): ) download_extractor: Optional[ Union[CustomRecordExtractor, DpathExtractor, ResponseToFileExtractor] - ] = Field( - None, description="Responsible for fetching the records from provided urls." 
- ) + ] = Field(None, description="Responsible for fetching the records from provided urls.") creation_requester: Union[CustomRequester, HttpRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", @@ -2357,11 +2339,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -2429,12 +2407,10 @@ class DynamicDeclarativeStream(BaseModel): stream_template: DeclarativeStream = Field( ..., description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = ( - Field( - ..., - description="Component resolve and populates stream templates with components values.", - title="Components Resolver", - ) + components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = Field( + ..., + description="Component resolve and populates stream templates with components values.", + title="Components Resolver", ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 987f3b7e5..323cd8563 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2648,7 +2648,11 @@ def create_simple_retriever( stop_condition_on_cursor: bool = False, client_side_incremental_sync: Optional[Dict[str, Any]] = None, transformations: List[RecordTransformation], - incremental_sync: Optional[Union[IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel]] = None, + incremental_sync: Optional[ + Union[ + IncrementingCountCursorModel, DatetimeBasedCursorModel, CustomIncrementalSyncModel + ] + ] = None, **kwargs: Any, ) -> SimpleRetriever: decoder = ( @@ -2707,10 +2711,21 @@ def create_simple_retriever( model.ignore_stream_slicer_parameters_on_paginated_requests or False ) - if model.lazy_read_pointer and not bool(self._connector_state_manager.get_stream_state(name, None)): - lazy_read_pointer = [InterpolatedString.create(path, parameters=model.parameters or {}) for path in model.lazy_read_pointer] - partition_router = self._create_component_from_model(model=model.partition_router, config=config) - stream_slicer = self._create_component_from_model(model=incremental_sync, config=config) if incremental_sync else SinglePartitionRouter(parameters={}) + if model.lazy_read_pointer and not bool( + self._connector_state_manager.get_stream_state(name, None) + ): + lazy_read_pointer = [ + InterpolatedString.create(path, parameters=model.parameters or {}) + for path in model.lazy_read_pointer + ] + partition_router = self._create_component_from_model( + model=model.partition_router, config=config + ) + stream_slicer = ( + self._create_component_from_model(model=incremental_sync, config=config) + if incremental_sync + else SinglePartitionRouter(parameters={}) + ) return LazySimpleRetriever( name=name, diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index d5869410d..bae827ac0 100644 --- 
a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -6,7 +6,17 @@ import copy import logging from dataclasses import InitVar, dataclass -from typing import TYPE_CHECKING, Any, Iterable, Tuple, List, Mapping, MutableMapping, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Iterable, + List, + Mapping, + MutableMapping, + Optional, + Tuple, + Union, +) import dpath @@ -133,7 +143,9 @@ def _get_request_option( parent_config.request_option.inject_into_request(params, value, self.config) return params - def process_parent_record(self, parent_record: Union[AirbyteMessage, Record, Mapping], parent_stream_name: str) -> Tuple[Optional[Mapping], Optional[Mapping]]: + def process_parent_record( + self, parent_record: Union[AirbyteMessage, Record, Mapping], parent_stream_name: str + ) -> Tuple[Optional[Mapping], Optional[Mapping]]: """ Processes and extracts data from a parent record, handling different record types and ensuring only valid types proceed. @@ -150,7 +162,7 @@ def process_parent_record(self, parent_record: Union[AirbyteMessage, Record, Map ) if parent_record.type == MessageType.RECORD: return parent_record.record.data, {} - return None, None # Skip invalid or non-record data + return None, None # Skip invalid or non-record data # Handle Record type if isinstance(parent_record, Record): @@ -200,7 +212,9 @@ def stream_slices(self) -> Iterable[StreamSlice]: # not support either substreams or RFR, but something that needs to be considered once we do for parent_record in parent_stream.read_only_records(): # Process the parent record - parent_record, parent_partition = self.process_parent_record(parent_record, parent_stream.name) + parent_record, parent_partition = self.process_parent_record( + parent_record, parent_stream.name + ) # Skip invalid or non-record data if parent_record is None: diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index 687666528..f70bb069a 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -19,7 +19,9 @@ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import ( SinglePartitionRouter, ) -from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator from airbyte_cdk.sources.declarative.requesters.request_options import ( @@ -640,6 +642,7 @@ class LazySimpleRetriever(SimpleRetriever): """ A retriever that supports lazy loading from parent streams. 
""" + partition_router: SubstreamPartitionRouter = field(init=True, repr=False, default=None) lazy_read_pointer: Optional[List[InterpolatedString]] = None @@ -653,7 +656,9 @@ def _read_pages( parent_stream = parent_stream_config.stream for parent_record in parent_stream.read_only_records(): - parent_record, parent_partition = self.partition_router.process_parent_record(parent_record, parent_stream.name) + parent_record, parent_partition = self.partition_router.process_parent_record( + parent_record, parent_stream.name + ) if parent_record is None: continue @@ -661,7 +666,12 @@ def _read_pages( response = self._create_response(childs) yield from self._yield_records_with_pagination( - response, records_generator_fn, stream_state, stream_slice, parent_record, parent_stream_config + response, + records_generator_fn, + stream_state, + stream_slice, + parent_record, + parent_stream_config, ) yield from [] @@ -672,7 +682,11 @@ def _extract_child_records(self, parent_record: Mapping) -> Mapping: return parent_record path = [path.eval(self.config) for path in self.lazy_read_pointer] - return dpath.values(parent_record, path) if "*" in path else dpath.get(parent_record, path, default=[]) + return ( + dpath.values(parent_record, path) + if "*" in path + else dpath.get(parent_record, path, default=[]) + ) def _create_response(self, data: Mapping) -> SafeResponse: """Create a SafeResponse with the given data.""" @@ -700,7 +714,14 @@ def _yield_records_with_pagination( next_page_token = self._next_page_token(response, last_page_size, last_record, None) if next_page_token: - yield from self._paginate(next_page_token, records_generator_fn, stream_state, stream_slice, parent_record, parent_stream_config) + yield from self._paginate( + next_page_token, + records_generator_fn, + stream_state, + stream_slice, + parent_record, + parent_stream_config, + ) def _paginate( self, @@ -713,8 +734,13 @@ def _paginate( ) -> Iterable[Record]: """Handle pagination by fetching subsequent pages.""" partition_field = parent_stream_config.partition_field.eval(self.config) - partition_value = dpath.get(parent_record, parent_stream_config.parent_key.eval(self.config)) - stream_slice = StreamSlice(partition={partition_field: partition_value, "parent_slice": {}}, cursor_slice=stream_slice.cursor_slice) + partition_value = dpath.get( + parent_record, parent_stream_config.parent_key.eval(self.config) + ) + stream_slice = StreamSlice( + partition={partition_field: partition_value, "parent_slice": {}}, + cursor_slice=stream_slice.cursor_slice, + ) while next_page_token: response = self._fetch_next_page(stream_state, stream_slice, next_page_token) @@ -725,6 +751,9 @@ def _paginate( last_record = record yield record - last_page_token_value = next_page_token.get("next_page_token") if next_page_token else None - next_page_token = self._next_page_token(response, last_page_size, last_record, last_page_token_value) - + last_page_token_value = ( + next_page_token.get("next_page_token") if next_page_token else None + ) + next_page_token = self._next_page_token( + response, last_page_size, last_record, last_page_token_value + ) From 40efd79019e487f3d33fc61af605845fb2555050 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 02:00:27 +0100 Subject: [PATCH 06/25] Add LazySimpleRetriever to retrivers --- airbyte_cdk/sources/declarative/retrievers/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/retrievers/__init__.py 
b/airbyte_cdk/sources/declarative/retrievers/__init__.py index 177d141a3..e087ac94e 100644 --- a/airbyte_cdk/sources/declarative/retrievers/__init__.py +++ b/airbyte_cdk/sources/declarative/retrievers/__init__.py @@ -5,8 +5,9 @@ from airbyte_cdk.sources.declarative.retrievers.async_retriever import AsyncRetriever from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever from airbyte_cdk.sources.declarative.retrievers.simple_retriever import ( + LazySimpleRetriever, SimpleRetriever, SimpleRetrieverTestReadDecorator, ) -__all__ = ["Retriever", "SimpleRetriever", "SimpleRetrieverTestReadDecorator", "AsyncRetriever"] +__all__ = ["Retriever", "SimpleRetriever", "SimpleRetrieverTestReadDecorator", "AsyncRetriever", "LazySimpleRetriever"] From 65b0546803f46094d48ee87bf511a6970adbe18f Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 02:09:09 +0100 Subject: [PATCH 07/25] Add lazy_read_pointer model --- .../declarative_component_schema.yaml | 9 ++ .../models/declarative_component_schema.py | 85 +++++++++++++------ 2 files changed, 66 insertions(+), 28 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index c6a263002..67004f49f 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -3208,6 +3208,15 @@ definitions: - "$ref": "#/definitions/IterableDecoder" - "$ref": "#/definitions/XmlDecoder" - "$ref": "#/definitions/ZipfileDecoder" + lazy_read_pointer: + title: Lazy Read Pointer + description: If set, this will enable lazy reading, using the initial read of parent records to extract child records. + type: array + default: [ ] + items: + - type: string + interpolation_context: + - config $parameters: type: object additionalProperties: true diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index f85ae7993..676803589 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -609,7 +609,9 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], + examples=[ + ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] + ], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -1078,24 +1080,28 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", + ) ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + 
codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( + Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", + ) ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -1113,7 +1119,9 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], + examples=[ + {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} + ], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1766,7 +1774,9 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( + schema_normalization: Optional[ + Union[SchemaNormalization, CustomSchemaNormalization] + ] = Field( SchemaNormalization.None_, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -1987,7 +1997,9 @@ class Config: description="Component used to fetch data incrementally based on a time field in the data.", title="Incremental Sync", ) - name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") + name: Optional[str] = Field( + "", description="The stream name.", example=["Users"], title="Name" + ) primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) @@ -2261,7 +2273,11 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -2284,6 +2300,11 @@ class SimpleRetriever(BaseModel): description="Component decoding the response so records can be extracted.", title="Decoder", ) + lazy_read_pointer: Optional[List[str]] = Field( + [], + description="If set, this will enable lazy reading, using the initial read of parent records to extract child records.", + title="Lazy Read Pointer", + ) parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") @@ -2305,7 +2326,9 @@ class AsyncRetriever(BaseModel): ) download_extractor: Optional[ Union[CustomRecordExtractor, DpathExtractor, ResponseToFileExtractor] - ] = Field(None, description="Responsible for fetching the records from provided urls.") + ] = Field( + None, description="Responsible for fetching the records from provided urls." 
+ ) creation_requester: Union[CustomRequester, HttpRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", @@ -2339,7 +2362,11 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -2407,10 +2434,12 @@ class DynamicDeclarativeStream(BaseModel): stream_template: DeclarativeStream = Field( ..., description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = Field( - ..., - description="Component resolve and populates stream templates with components values.", - title="Components Resolver", + components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = ( + Field( + ..., + description="Component resolve and populates stream templates with components values.", + title="Components Resolver", + ) ) From 74dbe15e3e2ae1417966e23b07272167e7e73935 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 10:23:07 +0100 Subject: [PATCH 08/25] Fix mypy --- .../parsers/model_to_component_factory.py | 16 +++++- .../substream_partition_router.py | 24 ++++----- .../retrievers/simple_retriever.py | 52 ++++++++++++------- 3 files changed, 58 insertions(+), 34 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 323cd8563..3144bd30d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2711,15 +2711,27 @@ def create_simple_retriever( model.ignore_stream_slicer_parameters_on_paginated_requests or False ) + if model.lazy_read_pointer and not hasattr(model, "partition_router"): + raise ValueError( + "LazySimpleRetriever requires a 'partition_router' when 'lazy_read_pointer' is set. " + "Please either define 'partition_router' or remove 'lazy_read_pointer' from the model." + ) + if model.lazy_read_pointer and not bool( - self._connector_state_manager.get_stream_state(name, None) + self._connector_state_manager.get_stream_state(name, None) ): + if model.partition_router.type != "SubstreamPartitionRouterModel": # type: ignore[union-attr] # model.partition_router has BaseModel type + raise ValueError( + "LazySimpleRetriever only supports 'SubstreamPartitionRouterModel' as the 'partition_router' type. " # type: ignore[union-attr] # model.partition_router has BaseModel type + f"Found: '{model.partition_router.type}'." 
+ ) + lazy_read_pointer = [ InterpolatedString.create(path, parameters=model.parameters or {}) for path in model.lazy_read_pointer ] partition_router = self._create_component_from_model( - model=model.partition_router, config=config + model=model.partition_router, config=config # type: ignore[arg-type] ) stream_slicer = ( self._create_component_from_model(model=incremental_sync, config=config) diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index bae827ac0..a82c4b2ce 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -144,8 +144,8 @@ def _get_request_option( return params def process_parent_record( - self, parent_record: Union[AirbyteMessage, Record, Mapping], parent_stream_name: str - ) -> Tuple[Optional[Mapping], Optional[Mapping]]: + self, parent_record: Union[AirbyteMessage, Record, Mapping[str, Any]], parent_stream_name: str + ) -> Tuple[Optional[MutableMapping[str, Any]], Optional[MutableMapping[str, Any]]]: """ Processes and extracts data from a parent record, handling different record types and ensuring only valid types proceed. @@ -161,23 +161,21 @@ def process_parent_record( f"This SubstreamPartitionRouter is not able to checkpoint incremental parent state." ) if parent_record.type == MessageType.RECORD: - return parent_record.record.data, {} + return parent_record.record.data, {} # type: ignore[union-attr] # parent_record.record is always AirbyteRecordMessage return None, None # Skip invalid or non-record data - # Handle Record type if isinstance(parent_record, Record): parent_partition = ( parent_record.associated_slice.partition if parent_record.associated_slice else {} ) - return parent_record.data, parent_partition + return {**parent_record.data}, {**parent_partition} - # Validate the record type - if not isinstance(parent_record, Mapping): - raise AirbyteTracedException( - message=f"Parent stream returned records as invalid type {type(parent_record)}" - ) + if isinstance(parent_record, Mapping): + return {**parent_record}, {} - return parent_record, {} + raise AirbyteTracedException( + message=f"Parent stream returned records as invalid type {type(parent_record)}" + ) def stream_slices(self) -> Iterable[StreamSlice]: """ @@ -210,10 +208,10 @@ def stream_slices(self) -> Iterable[StreamSlice]: # read_stateless() assumes the parent is not concurrent. 
This is currently okay since the concurrent CDK does # not support either substreams or RFR, but something that needs to be considered once we do - for parent_record in parent_stream.read_only_records(): + for raw_parent_record in parent_stream.read_only_records(): # Process the parent record parent_record, parent_partition = self.process_parent_record( - parent_record, parent_stream.name + raw_parent_record, parent_stream.name ) # Skip invalid or non-record data diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index f70bb069a..b351fca89 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -6,10 +6,11 @@ from dataclasses import InitVar, dataclass, field from functools import partial from itertools import islice -from typing import Any, Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union +from typing import Any, Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union, MutableMapping import dpath import requests +from typing_extensions import deprecated from airbyte_cdk.models import AirbyteMessage from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector @@ -35,6 +36,7 @@ from airbyte_cdk.sources.streams.core import StreamData from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState from airbyte_cdk.utils.mapping_helpers import combine_mappings +from airbyte_cdk.sources.source import ExperimentalClassWarning FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete" @@ -625,25 +627,29 @@ def _fetch_next_page( class SafeResponse(requests.Response): - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: return getattr(requests.Response, name, None) @property - def content(self): + def content(self) -> Optional[bytes]: return super().content @content.setter - def content(self, value): + def content(self, value: Union[str, bytes]) -> None: self._content = value.encode() if isinstance(value, str) else value +@deprecated( + "This class is experimental. Use at your own risk.", + category=ExperimentalClassWarning, +) @dataclass class LazySimpleRetriever(SimpleRetriever): """ A retriever that supports lazy loading from parent streams. 
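+
+    The factory builds this retriever only when `lazy_read_pointer` is set and there is
+    no incoming stream state. Instead of requesting a first page per parent partition, it
+    iterates the parent stream's records, extracts the embedded child records at
+    `lazy_read_pointer`, wraps them in a synthetic `SafeResponse`, and issues real
+    paginated HTTP requests only for any remaining pages.
+
+    A minimal configuration sketch (field values are illustrative, mirroring the
+    unit-test manifest added later in this series):
+
+        retriever:
+          type: SimpleRetriever
+          lazy_read_pointer: ["items"]
+          partition_router:
+            type: SubstreamPartitionRouter
+            parent_stream_configs: [...]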
""" - partition_router: SubstreamPartitionRouter = field(init=True, repr=False, default=None) + partition_router: SubstreamPartitionRouter = field(init=True, repr=False, default=None) # type: ignore[assignment] # 'partition_router' is required for LazySimpleRetriever and is validated in the constructor lazy_read_pointer: Optional[List[InterpolatedString]] = None def _read_pages( @@ -655,9 +661,9 @@ def _read_pages( parent_stream_config = self.partition_router.parent_stream_configs[-1] parent_stream = parent_stream_config.stream - for parent_record in parent_stream.read_only_records(): + for raw_parent_record in parent_stream.read_only_records(): parent_record, parent_partition = self.partition_router.process_parent_record( - parent_record, parent_stream.name + raw_parent_record, parent_stream.name ) if parent_record is None: continue @@ -676,19 +682,19 @@ def _read_pages( yield from [] - def _extract_child_records(self, parent_record: Mapping) -> Mapping: + def _extract_child_records(self, parent_record: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Extract child records from a parent record based on lazy pointers.""" if not self.lazy_read_pointer: return parent_record path = [path.eval(self.config) for path in self.lazy_read_pointer] return ( - dpath.values(parent_record, path) + dpath.values(parent_record, path) # type: ignore # return value will be a MutableMapping, given input data structure if "*" in path else dpath.get(parent_record, path, default=[]) ) - def _create_response(self, data: Mapping) -> SafeResponse: + def _create_response(self, data: Mapping[str, Any]) -> SafeResponse: """Create a SafeResponse with the given data.""" response = SafeResponse() response.content = json.dumps(data).encode("utf-8") @@ -701,7 +707,7 @@ def _yield_records_with_pagination( records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, - parent_record: Record, + parent_record: MutableMapping[str, Any], parent_stream_config: Any, ) -> Iterable[Record]: """Yield records, handling pagination if needed.""" @@ -729,7 +735,7 @@ def _paginate( records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, - parent_record: Record, + parent_record: MutableMapping[str, Any], parent_stream_config: Any, ) -> Iterable[Record]: """Handle pagination by fetching subsequent pages.""" @@ -742,7 +748,9 @@ def _paginate( cursor_slice=stream_slice.cursor_slice, ) - while next_page_token: + pagination_complete = False + + while not pagination_complete: response = self._fetch_next_page(stream_state, stream_slice, next_page_token) last_page_size, last_record = 0, None @@ -751,9 +759,15 @@ def _paginate( last_record = record yield record - last_page_token_value = ( - next_page_token.get("next_page_token") if next_page_token else None - ) - next_page_token = self._next_page_token( - response, last_page_size, last_record, last_page_token_value - ) + if not response: + pagination_complete = True + else: + last_page_token_value = ( + next_page_token.get("next_page_token") if next_page_token else None + ) + next_page_token = self._next_page_token( + response, last_page_size, last_record, last_page_token_value + ) + + if not next_page_token: + pagination_complete = True From b53760220be66aea2f131b2adee009006d5a6528 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 10:25:09 +0100 Subject: [PATCH 09/25] Fix typo --- 
.../sources/declarative/retrievers/simple_retriever.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index b351fca89..82f4d73b1 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -668,8 +668,8 @@ def _read_pages( if parent_record is None: continue - childs = self._extract_child_records(parent_record) - response = self._create_response(childs) + child_records = self._extract_child_records(parent_record) + response = self._create_response(child_records) yield from self._yield_records_with_pagination( response, From 33b9f83ab3eb1f972a74e4b99aeebe344a42dc3d Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 13 Mar 2025 09:26:24 +0000 Subject: [PATCH 10/25] Auto-fix lint and format issues --- .../models/declarative_component_schema.py | 80 +++++++------------ .../parsers/model_to_component_factory.py | 5 +- .../substream_partition_router.py | 4 +- .../declarative/retrievers/__init__.py | 8 +- .../retrievers/simple_retriever.py | 19 ++++- 5 files changed, 57 insertions(+), 59 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 676803589..c1325c87b 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -609,9 +609,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -1080,28 +1078,24 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( - Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + 
codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", - ) + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -1119,9 +1113,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1774,9 +1766,7 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[ - Union[SchemaNormalization, CustomSchemaNormalization] - ] = Field( + schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( SchemaNormalization.None_, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -1997,9 +1987,7 @@ class Config: description="Component used to fetch data incrementally based on a time field in the data.", title="Incremental Sync", ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" - ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) @@ -2273,11 +2261,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -2326,9 +2310,7 @@ class AsyncRetriever(BaseModel): ) download_extractor: Optional[ Union[CustomRecordExtractor, DpathExtractor, ResponseToFileExtractor] - ] = Field( - None, description="Responsible for fetching the records from provided urls." 
- ) + ] = Field(None, description="Responsible for fetching the records from provided urls.") creation_requester: Union[CustomRequester, HttpRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", @@ -2362,11 +2344,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -2434,12 +2412,10 @@ class DynamicDeclarativeStream(BaseModel): stream_template: DeclarativeStream = Field( ..., description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = ( - Field( - ..., - description="Component resolve and populates stream templates with components values.", - title="Components Resolver", - ) + components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = Field( + ..., + description="Component resolve and populates stream templates with components values.", + title="Components Resolver", ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3144bd30d..4d7fd0713 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2718,7 +2718,7 @@ def create_simple_retriever( ) if model.lazy_read_pointer and not bool( - self._connector_state_manager.get_stream_state(name, None) + self._connector_state_manager.get_stream_state(name, None) ): if model.partition_router.type != "SubstreamPartitionRouterModel": # type: ignore[union-attr] # model.partition_router has BaseModel type raise ValueError( @@ -2731,7 +2731,8 @@ def create_simple_retriever( for path in model.lazy_read_pointer ] partition_router = self._create_component_from_model( - model=model.partition_router, config=config # type: ignore[arg-type] + model=model.partition_router, + config=config, # type: ignore[arg-type] ) stream_slicer = ( self._create_component_from_model(model=incremental_sync, config=config) diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index a82c4b2ce..2a58a25b8 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -144,7 +144,9 @@ def _get_request_option( return params def process_parent_record( - self, parent_record: Union[AirbyteMessage, Record, Mapping[str, Any]], parent_stream_name: str + self, + parent_record: Union[AirbyteMessage, Record, Mapping[str, Any]], + parent_stream_name: str, ) -> Tuple[Optional[MutableMapping[str, Any]], Optional[MutableMapping[str, Any]]]: """ Processes and extracts data from a parent record, handling different record types diff --git a/airbyte_cdk/sources/declarative/retrievers/__init__.py b/airbyte_cdk/sources/declarative/retrievers/__init__.py index e087ac94e..5b26220e0 100644 --- a/airbyte_cdk/sources/declarative/retrievers/__init__.py +++ b/airbyte_cdk/sources/declarative/retrievers/__init__.py @@ -10,4 +10,10 @@ SimpleRetrieverTestReadDecorator, ) -__all__ = 
["Retriever", "SimpleRetriever", "SimpleRetrieverTestReadDecorator", "AsyncRetriever", "LazySimpleRetriever"] +__all__ = [ + "Retriever", + "SimpleRetriever", + "SimpleRetrieverTestReadDecorator", + "AsyncRetriever", + "LazySimpleRetriever", +] diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index 82f4d73b1..17623a70c 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -6,7 +6,18 @@ from dataclasses import InitVar, dataclass, field from functools import partial from itertools import islice -from typing import Any, Callable, Iterable, List, Mapping, Optional, Set, Tuple, Union, MutableMapping +from typing import ( + Any, + Callable, + Iterable, + List, + Mapping, + MutableMapping, + Optional, + Set, + Tuple, + Union, +) import dpath import requests @@ -33,10 +44,10 @@ from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer from airbyte_cdk.sources.http_logger import format_http_message +from airbyte_cdk.sources.source import ExperimentalClassWarning from airbyte_cdk.sources.streams.core import StreamData from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState from airbyte_cdk.utils.mapping_helpers import combine_mappings -from airbyte_cdk.sources.source import ExperimentalClassWarning FULL_REFRESH_SYNC_COMPLETE_KEY = "__ab_full_refresh_sync_complete" @@ -682,7 +693,9 @@ def _read_pages( yield from [] - def _extract_child_records(self, parent_record: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + def _extract_child_records( + self, parent_record: MutableMapping[str, Any] + ) -> MutableMapping[str, Any]: """Extract child records from a parent record based on lazy pointers.""" if not self.lazy_read_pointer: return parent_record From af80b234a2738bad5eaa7acbd5606ec7df4792ee Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 10:30:35 +0100 Subject: [PATCH 11/25] Fix mypy after fromating --- .../sources/declarative/parsers/model_to_component_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 4d7fd0713..5ea6d5752 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2731,7 +2731,7 @@ def create_simple_retriever( for path in model.lazy_read_pointer ] partition_router = self._create_component_from_model( - model=model.partition_router, + model=model.partition_router, # type: ignore[arg-type] # model.partition_router has BaseModel type config=config, # type: ignore[arg-type] ) stream_slicer = ( From ed4ea743638c765e3686bf128897352e6ba4741a Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 11:30:13 +0100 Subject: [PATCH 12/25] Add unit tests --- .../parsers/model_to_component_factory.py | 4 +- .../strategies/cursor_pagination_strategy.py | 1 - .../retrievers/test_lazy_simple_retriever.py | 335 ++++++++++++++++++ 3 files changed, 338 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 4dfd495d1..d9f79e77b 100644 --- 
a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -1746,6 +1746,7 @@ def create_declarative_stream( transformations.append( self._create_component_from_model(model=transformation_model, config=config) ) + retriever = self._create_component_from_model( model=model.retriever, config=config, @@ -1756,6 +1757,7 @@ def create_declarative_stream( stop_condition_on_cursor=stop_condition_on_cursor, client_side_incremental_sync=client_side_incremental_sync, transformations=transformations, + incremental_sync=model.incremental_sync, ) cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None @@ -2747,7 +2749,7 @@ def create_simple_retriever( if model.lazy_read_pointer and not bool( self._connector_state_manager.get_stream_state(name, None) ): - if model.partition_router.type != "SubstreamPartitionRouterModel": # type: ignore[union-attr] # model.partition_router has BaseModel type + if model.partition_router.type != "SubstreamPartitionRouter": # type: ignore[union-attr] # model.partition_router has BaseModel type raise ValueError( "LazySimpleRetriever only supports 'SubstreamPartitionRouterModel' as the 'partition_router' type. " # type: ignore[union-attr] # model.partition_router has BaseModel type f"Found: '{model.partition_router.type}'." diff --git a/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py b/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py index e35c84c7c..8df5ce66f 100644 --- a/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +++ b/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py @@ -71,7 +71,6 @@ def next_page_token( last_page_token_value: Optional[Any] = None, ) -> Optional[Any]: decoded_response = next(self.decoder.decode(response)) - # The default way that link is presented in requests.Response is a string of various links (last, next, etc). This # is not indexable or useful for parsing the cursor, so we replace it with the link dictionary from response.links headers: Dict[str, Any] = dict(response.headers) diff --git a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py index 2acb8555b..32f353400 100644 --- a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py +++ b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py @@ -1,3 +1,338 @@ # # Copyright (c) 2025 Airbyte, Inc., all rights reserved. 
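+#
+# These tests drive LazySimpleRetriever end-to-end through ConcurrentDeclarativeSource:
+# the mocked parent response embeds the first page of child records under "items", so a
+# full sync should only issue child requests for the remaining pages, while a sync that
+# starts with state falls back to regular per-partition child requests.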
# + +import json +from unittest.mock import MagicMock + +import freezegun + +from airbyte_cdk.models import ( + AirbyteStateBlob, + AirbyteStateMessage, + AirbyteStateType, + AirbyteStreamState, + ConfiguredAirbyteCatalog, + ConfiguredAirbyteStream, + DestinationSyncMode, + StreamDescriptor, + Type, +) +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) +from airbyte_cdk.test.mock_http import HttpMocker, HttpRequest, HttpResponse + +_CONFIG = {"start_date": "2024-07-01T00:00:00.000Z"} +_MANIFEST = { + "version": "6.0.0", + "type": "DeclarativeSource", + "check": {"type": "CheckStream", "stream_names": ["TestStream"]}, + "definitions": { + "TestStream": { + "type": "DeclarativeStream", + "name": "TestStream", + "primary_key": [], + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {}, + "type": "object", + }, + }, + "retriever": { + "type": "SimpleRetriever", + "lazy_read_pointer": ["items"], + "requester": { + "type": "HttpRequester", + "url_base": "https://api.test.com", + "path": "parent/{{ stream_partition.parent_id }}/items", + "http_method": "GET", + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": {"type": "DpathExtractor", "field_path": ["data"]}, + }, + "paginator": { + "type": "DefaultPaginator", + "page_token_option": { + "type": "RequestOption", + "inject_into": "request_parameter", + "field_name": "starting_after" + }, + "pagination_strategy": { + "type": "CursorPagination", + "cursor_value": '{{ response["data"][-1]["id"] }}', + "stop_condition": '{{ not response.get("has_more", False) }}' + } + }, + "partition_router": { + "type": "SubstreamPartitionRouter", + "parent_stream_configs": [ + { + "type": "ParentStreamConfig", + "parent_key": "id", + "partition_field": "parent_id", + "stream": { + "type": "DeclarativeStream", + "name": "parent", + "retriever": { + "type": "SimpleRetriever", + "requester": { + "type": "HttpRequester", + "url_base": "https://api.test.com", + "path": "/parents", + "http_method": "GET", + "authenticator": { + "type": "ApiKeyAuthenticator", + "header": "apikey", + "api_token": "{{ config['api_key'] }}", + }, + }, + "record_selector": { + "type": "RecordSelector", + "extractor": { + "type": "DpathExtractor", + "field_path": ["data"], + }, + }, + }, + "schema_loader": { + "type": "InlineSchemaLoader", + "schema": { + "$schema": "http://json-schema.org/schema#", + "properties": {"id": {"type": "integer"}}, + "type": "object", + }, + }, + }, + } + ], + }, + }, + "incremental_sync": { + "type": "DatetimeBasedCursor", + "start_datetime": { + "datetime": "{{ format_datetime(config['start_date'], '%Y-%m-%d') }}" + }, + "end_datetime": {"datetime": "{{ now_utc().strftime('%Y-%m-%d') }}"}, + "datetime_format": "%Y-%m-%d", + "cursor_datetime_formats": ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"], + "cursor_granularity": "P1D", + "step": "P15D", + "cursor_field": "updated_at", + "start_time_option": { + "type": "RequestOption", + "field_name": "start", + "inject_into": "request_parameter", + }, + "end_time_option": { + "type": "RequestOption", + "field_name": "end", + "inject_into": "request_parameter", + }, + }, + }, + }, + "streams": [{"$ref": "#/definitions/TestStream"}], + "spec": { + "connection_specification": { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + 
"required": [], + "properties": {}, + "additionalProperties": True, + }, + "documentation_url": "https://example.org", + "type": "Spec", + }, +} + + +def to_configured_stream( + stream, + sync_mode=None, + destination_sync_mode=DestinationSyncMode.append, + cursor_field=None, + primary_key=None, +) -> ConfiguredAirbyteStream: + return ConfiguredAirbyteStream( + stream=stream, + sync_mode=sync_mode, + destination_sync_mode=destination_sync_mode, + cursor_field=cursor_field, + primary_key=primary_key, + ) + + +def to_configured_catalog( + configured_streams, +) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog(streams=configured_streams) + + +def create_configured_catalog( + source: ConcurrentDeclarativeSource, config: dict +) -> ConfiguredAirbyteCatalog: + """ + Discovers streams from the source and converts them into a configured catalog. + """ + actual_catalog = source.discover(logger=source.logger, config=config) + configured_streams = [ + to_configured_stream(stream, primary_key=stream.source_defined_primary_key) + for stream in actual_catalog.streams + ] + return to_configured_catalog(configured_streams) + + +def get_records( + source: ConcurrentDeclarativeSource, + config: dict, + catalog: ConfiguredAirbyteCatalog, + state: list = None, +) -> list: + """ + Reads records from the source given a configuration, catalog, and optional state. + Returns a list of record data dictionaries. + """ + return [ + message.record.data + for message in source.read(logger=MagicMock(), config=config, catalog=catalog, state=state) + if message.type == Type.RECORD + ] + + +@freezegun.freeze_time("2024-07-15") +def test_retriever_with_lazy_reading(): + """Test the lazy loading behavior of the SimpleRetriever with paginated substream data.""" + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/parents"), + HttpResponse( + body=json.dumps( + { + "data": [ + {"id": 1, "name": "parent_1", "updated_at": "2024-07-13", "items": {"data": [{"id": 1}, {"id": 2}], "has_more": True}}, + {"id": 2, "name": "parent_2", "updated_at": "2024-07-13", "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}}, + ], + "has_more": False + } + ) + ), + ) + + http_mocker.get( + HttpRequest(url="https://api.test.com/parent/1/items?starting_after=2&start=2024-07-01&end=2024-07-15"), + HttpResponse( + body=json.dumps( + { + "data": [{"id": 5}, {"id": 6}], + "has_more": True + } + ) + ), + ) + + http_mocker.get( + HttpRequest(url="https://api.test.com/parent/1/items?starting_after=6&start=2024-07-01&end=2024-07-15"), + HttpResponse( + body=json.dumps( + { + "data": [{"id": 7}], + "has_more": False + } + ) + ), + ) + + source = ConcurrentDeclarativeSource( + source_config=_MANIFEST, config=_CONFIG, catalog=None, state=None + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + # Test full data retrieval (without state) + full_records = get_records(source, _CONFIG, configured_catalog) + expected_full = [ + {"id": 1}, + {"id": 2}, + {"id": 5}, + {"id": 6}, + {"id": 7}, + {"id": 3}, + {"id": 4}, + ] + assert expected_full == full_records + + +@freezegun.freeze_time("2024-07-15") +def test_incremental_sync_with_state(): + """Test incremental sync behavior using state to fetch only new records.""" + with HttpMocker() as http_mocker: + http_mocker.get( + HttpRequest(url="https://api.test.com/parents"), + HttpResponse( + body=json.dumps( + { + "data": [ + {"id": 1, "name": "parent_1", "updated_at": "2024-07-13", + "items": {"data": [{"id": 1}, {"id": 2}], 
"has_more": False}}, + {"id": 2, "name": "parent_2", "updated_at": "2024-07-13", + "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}}, + ], + "has_more": False + } + ) + ), + ) + + http_mocker.get( + HttpRequest(url="https://api.test.com/parent/1/items?start=2024-07-13&end=2024-07-15"), + HttpResponse( + body=json.dumps( + { + "data": [{"id": 10, "updated_at": "2024-07-13"}], + "has_more": False + } + ) + ), + ) + http_mocker.get( + HttpRequest(url="https://api.test.com/parent/2/items?start=2024-07-13&end=2024-07-15"), + HttpResponse( + body=json.dumps( + { + "data": [{"id": 11, "updated_at": "2024-07-13"}], + "has_more": False + } + ) + ), + ) + + state = [ + AirbyteStateMessage( + type=AirbyteStateType.STREAM, + stream=AirbyteStreamState( + stream_descriptor=StreamDescriptor(name="TestStream", namespace=None), + stream_state=AirbyteStateBlob(updated_at="2024-07-13"), + ), + ) + ] + source = ConcurrentDeclarativeSource( + source_config=_MANIFEST, config=_CONFIG, catalog=None, state=state + ) + configured_catalog = create_configured_catalog(source, _CONFIG) + + # Test incremental data retrieval (with state) + incremental_records = get_records(source, _CONFIG, configured_catalog, state) + expected_incremental = [ + {"id": 10, "updated_at": "2024-07-13"}, + {"id": 11, "updated_at": "2024-07-13"}, + ] + assert expected_incremental == incremental_records + From 4f03269746a5bdc0ebc9046d1dcbf521cee8e8a5 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 13 Mar 2025 10:42:03 +0000 Subject: [PATCH 13/25] Auto-fix lint and format issues --- .../retrievers/test_lazy_simple_retriever.py | 73 ++++++++++--------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py index 32f353400..d4d48eac5 100644 --- a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py +++ b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py @@ -64,13 +64,13 @@ "page_token_option": { "type": "RequestOption", "inject_into": "request_parameter", - "field_name": "starting_after" + "field_name": "starting_after", }, "pagination_strategy": { "type": "CursorPagination", "cursor_value": '{{ response["data"][-1]["id"] }}', - "stop_condition": '{{ not response.get("has_more", False) }}' - } + "stop_condition": '{{ not response.get("has_more", False) }}', + }, }, "partition_router": { "type": "SubstreamPartitionRouter", @@ -218,37 +218,37 @@ def test_retriever_with_lazy_reading(): body=json.dumps( { "data": [ - {"id": 1, "name": "parent_1", "updated_at": "2024-07-13", "items": {"data": [{"id": 1}, {"id": 2}], "has_more": True}}, - {"id": 2, "name": "parent_2", "updated_at": "2024-07-13", "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}}, + { + "id": 1, + "name": "parent_1", + "updated_at": "2024-07-13", + "items": {"data": [{"id": 1}, {"id": 2}], "has_more": True}, + }, + { + "id": 2, + "name": "parent_2", + "updated_at": "2024-07-13", + "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}, + }, ], - "has_more": False + "has_more": False, } ) ), ) http_mocker.get( - HttpRequest(url="https://api.test.com/parent/1/items?starting_after=2&start=2024-07-01&end=2024-07-15"), - HttpResponse( - body=json.dumps( - { - "data": [{"id": 5}, {"id": 6}], - "has_more": True - } - ) + HttpRequest( + url="https://api.test.com/parent/1/items?starting_after=2&start=2024-07-01&end=2024-07-15" ), + 
HttpResponse(body=json.dumps({"data": [{"id": 5}, {"id": 6}], "has_more": True})), ) http_mocker.get( - HttpRequest(url="https://api.test.com/parent/1/items?starting_after=6&start=2024-07-01&end=2024-07-15"), - HttpResponse( - body=json.dumps( - { - "data": [{"id": 7}], - "has_more": False - } - ) + HttpRequest( + url="https://api.test.com/parent/1/items?starting_after=6&start=2024-07-01&end=2024-07-15" ), + HttpResponse(body=json.dumps({"data": [{"id": 7}], "has_more": False})), ) source = ConcurrentDeclarativeSource( @@ -280,12 +280,20 @@ def test_incremental_sync_with_state(): body=json.dumps( { "data": [ - {"id": 1, "name": "parent_1", "updated_at": "2024-07-13", - "items": {"data": [{"id": 1}, {"id": 2}], "has_more": False}}, - {"id": 2, "name": "parent_2", "updated_at": "2024-07-13", - "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}}, + { + "id": 1, + "name": "parent_1", + "updated_at": "2024-07-13", + "items": {"data": [{"id": 1}, {"id": 2}], "has_more": False}, + }, + { + "id": 2, + "name": "parent_2", + "updated_at": "2024-07-13", + "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}, + }, ], - "has_more": False + "has_more": False, } ) ), @@ -295,10 +303,7 @@ def test_incremental_sync_with_state(): HttpRequest(url="https://api.test.com/parent/1/items?start=2024-07-13&end=2024-07-15"), HttpResponse( body=json.dumps( - { - "data": [{"id": 10, "updated_at": "2024-07-13"}], - "has_more": False - } + {"data": [{"id": 10, "updated_at": "2024-07-13"}], "has_more": False} ) ), ) @@ -306,10 +311,7 @@ def test_incremental_sync_with_state(): HttpRequest(url="https://api.test.com/parent/2/items?start=2024-07-13&end=2024-07-15"), HttpResponse( body=json.dumps( - { - "data": [{"id": 11, "updated_at": "2024-07-13"}], - "has_more": False - } + {"data": [{"id": 11, "updated_at": "2024-07-13"}], "has_more": False} ) ), ) @@ -335,4 +337,3 @@ def test_incremental_sync_with_state(): {"id": 11, "updated_at": "2024-07-13"}, ] assert expected_incremental == incremental_records - From f19d048f80531c5fc0809769e5c5f687202a8270 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 13:43:38 +0100 Subject: [PATCH 14/25] Add extra_fields support --- .../retrievers/simple_retriever.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index 17623a70c..23ce80155 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -358,7 +358,7 @@ def _fetch_next_page( # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well. 
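+    # NOTE: `records_generator_fn` now also receives the current stream slice, so that
+    # `LazySimpleRetriever` can rebuild the slice with `extra_fields` extracted from each
+    # parent record before records are parsed.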
def _read_pages( self, - records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, ) -> Iterable[Record]: @@ -372,7 +372,7 @@ def _read_pages( last_page_size = 0 last_record: Optional[Record] = None - for record in records_generator_fn(response): + for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func last_page_size += 1 last_record = record yield record @@ -397,7 +397,7 @@ def _read_pages( def _read_single_page( self, - records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, ) -> Iterable[StreamData]: @@ -412,7 +412,7 @@ def _read_single_page( last_page_size = 0 last_record: Optional[Record] = None - for record in records_generator_fn(response): + for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func last_page_size += 1 last_record = record yield record @@ -456,7 +456,6 @@ def read_records( record_generator = partial( self._parse_records, stream_state=self.state or {}, - stream_slice=_slice, records_schema=records_schema, ) @@ -665,7 +664,7 @@ class LazySimpleRetriever(SimpleRetriever): def _read_pages( self, - records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, ) -> Iterable[Record]: @@ -682,6 +681,20 @@ def _read_pages( child_records = self._extract_child_records(parent_record) response = self._create_response(child_records) + if parent_stream_config.extra_fields: + extra_fields = [ + [field_path_part.eval(self.config) for field_path_part in field_path] # type: ignore [union-attr] + for field_path in parent_stream_config.extra_fields + ] + + extracted_extra_fields = self.partition_router._extract_extra_fields(parent_record, extra_fields) + + stream_slice = StreamSlice( + partition=stream_slice.partition, + cursor_slice=stream_slice.cursor_slice, + extra_fields=extracted_extra_fields + ) + yield from self._yield_records_with_pagination( response, records_generator_fn, @@ -717,7 +730,7 @@ def _create_response(self, data: Mapping[str, Any]) -> SafeResponse: def _yield_records_with_pagination( self, response: requests.Response, - records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, parent_record: MutableMapping[str, Any], @@ -726,7 +739,7 @@ def _yield_records_with_pagination( """Yield records, handling pagination if needed.""" last_page_size, last_record = 0, None - for record in records_generator_fn(response): + for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func last_page_size += 1 last_record = record yield record @@ -745,7 +758,7 @@ def _yield_records_with_pagination( def _paginate( self, next_page_token: Any, - records_generator_fn: Callable[[Optional[requests.Response]], 
Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, parent_record: MutableMapping[str, Any], @@ -767,7 +780,7 @@ def _paginate( response = self._fetch_next_page(stream_state, stream_slice, next_page_token) last_page_size, last_record = 0, None - for record in records_generator_fn(response): + for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func last_page_size += 1 last_record = record yield record From 96646791b2ebc4aab5f4edcc49b1f5d386c8ab64 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 15:01:55 +0100 Subject: [PATCH 15/25] Fix child extraction --- .../declarative/parsers/model_to_component_factory.py | 8 ++++++++ .../sources/declarative/retrievers/simple_retriever.py | 6 +----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index d9f79e77b..efdae3408 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2754,6 +2754,14 @@ def create_simple_retriever( "LazySimpleRetriever only supports 'SubstreamPartitionRouterModel' as the 'partition_router' type. " # type: ignore[union-attr] # model.partition_router has BaseModel type f"Found: '{model.partition_router.type}'." ) + lazy_read_pointer = [] + for i, path in enumerate(model.lazy_read_pointer): + if path == "*": + raise ValueError( + f"'lazy_read_pointer' supports only direct pointing. Found: '*' as element {i} of the pointer." + ) + + lazy_read_pointer.append(InterpolatedString.create(path, parameters=model.parameters or {})) lazy_read_pointer = [ InterpolatedString.create(path, parameters=model.parameters or {}) for path in model.lazy_read_pointer ] diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index 23ce80155..3c1f25120 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -714,11 +714,7 @@ def _extract_child_records( return parent_record path = [path.eval(self.config) for path in self.lazy_read_pointer] - return ( - dpath.values(parent_record, path) # type: ignore # return value will be a MutableMapping, given input data structure - if "*" in path - else dpath.get(parent_record, path, default=[]) - ) + return dpath.get(parent_record, path, default=[]) From 2f31d73e74fffe6dda180b446fb6d4b99e116769 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 18:05:40 +0100 Subject: [PATCH 16/25] Refactor lazy read --- .../declarative_component_schema.yaml | 18 +-- .../models/declarative_component_schema.py | 94 +++++++----- .../parsers/model_to_component_factory.py | 44 ++---- .../substream_partition_router.py | 121 ++++++++------- .../retrievers/simple_retriever.py | 144 ++++-------------- .../retrievers/test_lazy_simple_retriever.py | 35 +++-- 6 files changed, 185 insertions(+), 271 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 25b121ed6..346742a29
100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -2873,6 +2873,15 @@ definitions: type: type: string enum: [ParentStreamConfig] + lazy_read_pointer: + title: Lazy Read Pointer + description: If set, this will enable lazy reading, using the initial read of parent records to extract child records. + type: array + default: [ ] + items: + - type: string + interpolation_context: + - config parent_key: title: Parent Key description: The primary key of records from the parent stream that will be used during the retrieval of records for the current substream. This parent identifier field is typically a characteristic of the child records being extracted from the source API. @@ -3242,15 +3251,6 @@ definitions: - "$ref": "#/definitions/IterableDecoder" - "$ref": "#/definitions/XmlDecoder" - "$ref": "#/definitions/ZipfileDecoder" - lazy_read_pointer: - title: Lazy Read Pointer - description: If set, this will enable lazy reading, using the initial read of parent records to extract child records. - type: array - default: [ ] - items: - - type: string - interpolation_context: - - config $parameters: type: object additionalProperties: true diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 7e25e0c24..aa32aa3bb 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -609,7 +609,9 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], + examples=[ + ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] + ], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -1078,24 +1080,28 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( + Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", + ) ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + 
codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( + Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", + ) ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -1113,7 +1119,9 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], + examples=[ + {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} + ], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1766,7 +1774,9 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( + schema_normalization: Optional[ + Union[SchemaNormalization, CustomSchemaNormalization] + ] = Field( SchemaNormalization.None_, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -1987,7 +1997,9 @@ class Config: description="Component used to fetch data incrementally based on a time field in the data.", title="Incremental Sync", ) - name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") + name: Optional[str] = Field( + "", description="The stream name.", example=["Users"], title="Name" + ) primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) @@ -2205,6 +2217,11 @@ class DynamicSchemaLoader(BaseModel): class ParentStreamConfig(BaseModel): type: Literal["ParentStreamConfig"] + lazy_read_pointer: Optional[List[str]] = Field( + [], + description="If set, this will enable lazy reading, using the initial read of parent records to extract child records.", + title="Lazy Read Pointer", + ) parent_key: str = Field( ..., description="The primary key of records from the parent stream that will be used during the retrieval of records for the current substream. 
This parent identifier field is typically a characteristic of the child records being extracted from the source API.", @@ -2240,7 +2257,9 @@ class ParentStreamConfig(BaseModel): class StateDelegatingStream(BaseModel): type: Literal["StateDelegatingStream"] - name: str = Field(..., description="The stream name.", example=["Users"], title="Name") + name: str = Field( + ..., description="The stream name.", example=["Users"], title="Name" + ) full_refresh_stream: DeclarativeStream = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages when the state is empty or not provided.", @@ -2277,7 +2296,11 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -2300,11 +2323,6 @@ class SimpleRetriever(BaseModel): description="Component decoding the response so records can be extracted.", title="Decoder", ) - lazy_read_pointer: Optional[List[str]] = Field( - [], - description="If set, this will enable lazy reading, using the initial read of parent records to extract child records.", - title="Lazy Read Pointer", - ) parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") @@ -2326,7 +2344,9 @@ class AsyncRetriever(BaseModel): ) download_extractor: Optional[ Union[CustomRecordExtractor, DpathExtractor, ResponseToFileExtractor] - ] = Field(None, description="Responsible for fetching the records from provided urls.") + ] = Field( + None, description="Responsible for fetching the records from provided urls." + ) creation_requester: Union[CustomRequester, HttpRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", @@ -2360,7 +2380,11 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], + List[ + Union[ + CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter + ] + ], ] ] = Field( [], @@ -2428,10 +2452,12 @@ class DynamicDeclarativeStream(BaseModel): stream_template: DeclarativeStream = Field( ..., description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = Field( - ..., - description="Component resolve and populates stream templates with components values.", - title="Components Resolver", + components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = ( + Field( + ..., + description="Component resolve and populates stream templates with components values.", + title="Components Resolver", + ) ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index efdae3408..53f62e762 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2528,6 +2528,10 @@ def create_parent_stream_config( if model.request_option else None ) + + if "*" in model.lazy_read_pointer: + raise ValueError("The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed.") + return ParentStreamConfig( 
parent_key=model.parent_key, request_option=request_option, @@ -2537,6 +2541,7 @@ def create_parent_stream_config( incremental_dependency=model.incremental_dependency or False, parameters=model.parameters or {}, extra_fields=model.extra_fields, + lazy_read_pointer=model.lazy_read_pointer ) @staticmethod @@ -2740,42 +2745,13 @@ def create_simple_retriever( model.ignore_stream_slicer_parameters_on_paginated_requests or False ) - if model.lazy_read_pointer and not hasattr(model, "partition_router"): - raise ValueError( - "LazySimpleRetriever requires a 'partition_router' when 'lazy_read_pointer' is set. " - "Please either define 'partition_router' or remove 'lazy_read_pointer' from the model." - ) + if hasattr(model, "partition_router") and model.partition_router and model.partition_router.type == "SubstreamPartitionRouter" and not bool( self._connector_state_manager.get_stream_state(name, None)) and any(parent_stream_config.lazy_read_pointer for parent_stream_config in model.partition_router.parent_stream_configs): - if model.lazy_read_pointer and not bool( - self._connector_state_manager.get_stream_state(name, None) - ): - if model.partition_router.type != "SubstreamPartitionRouter": # type: ignore[union-attr] # model.partition_router has BaseModel type + if incremental_sync.step or incremental_sync.cursor_granularity: raise ValueError( - "LazySimpleRetriever only supports 'SubstreamPartitionRouterModel' as the 'partition_router' type. " # type: ignore[union-attr] # model.partition_router has BaseModel type - f"Found: '{model.partition_router.type}'." + f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." ) - lazy_read_pointer = [] - for i, path in enumerate(model.lazy_read_pointer): - if path == "*": - raise ValueError( - f"'lazy_read_pointer' support only direct pointing. 
Found: '* as a {i} element in the pointer.'" - ) - - lazy_read_pointer.append(InterpolatedString.create(path, parameters=model.parameters or {})) - - lazy_read_pointer = [ - InterpolatedString.create(path, parameters=model.parameters or {}) - for path in model.lazy_read_pointer - ] - partition_router = self._create_component_from_model( - model=model.partition_router, # type: ignore[arg-type] # model.partition_router has BaseModel type - config=config, # type: ignore[arg-type] - ) - stream_slicer = ( - self._create_component_from_model(model=incremental_sync, config=config) - if incremental_sync - else SinglePartitionRouter(parameters={}) - ) return LazySimpleRetriever( name=name, @@ -2789,8 +2765,6 @@ def create_simple_retriever( config=config, ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests, parameters=model.parameters or {}, - partition_router=partition_router, - lazy_read_pointer=lazy_read_pointer, ) if self._limit_slices_fetched or self._emit_connector_builder_messages: diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 2a58a25b8..118a40c2b 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -3,22 +3,14 @@ # +import json import copy import logging from dataclasses import InitVar, dataclass -from typing import ( - TYPE_CHECKING, - Any, - Iterable, - List, - Mapping, - MutableMapping, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union import dpath +import requests from airbyte_cdk.models import AirbyteMessage from airbyte_cdk.models import Type as MessageType @@ -58,6 +50,7 @@ class ParentStreamConfig: ) request_option: Optional[RequestOption] = None incremental_dependency: bool = False + lazy_read_pointer: Optional[List[Union[InterpolatedString, str]]] = None def __post_init__(self, parameters: Mapping[str, Any]) -> None: self.parent_key = InterpolatedString.create(self.parent_key, parameters=parameters) @@ -71,6 +64,12 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: for key_path in self.extra_fields ] + self.lazy_read_pointer = [ + InterpolatedString.create(path, parameters=parameters) + if isinstance(path, str) + else path for path in self.lazy_read_pointer + ] if self.lazy_read_pointer else None + @dataclass class SubstreamPartitionRouter(PartitionRouter): @@ -143,42 +142,6 @@ def _get_request_option( parent_config.request_option.inject_into_request(params, value, self.config) return params - def process_parent_record( - self, - parent_record: Union[AirbyteMessage, Record, Mapping[str, Any]], - parent_stream_name: str, - ) -> Tuple[Optional[MutableMapping[str, Any]], Optional[MutableMapping[str, Any]]]: - """ - Processes and extracts data from a parent record, handling different record types - and ensuring only valid types proceed. - - :param parent_record: The parent record to process. - :param parent_stream_name: The parent stream name associated with the record. - :return: Extracted record data and partition (if applicable). - :raises AirbyteTracedException: If the record type is invalid. - """ - if isinstance(parent_record, AirbyteMessage): - self.logger.warning( - f"Parent stream {parent_stream_name} returns records of type AirbyteMessage. 
" - f"This SubstreamPartitionRouter is not able to checkpoint incremental parent state." - ) - if parent_record.type == MessageType.RECORD: - return parent_record.record.data, {} # type: ignore[union-attr] # parent_record.record is always AirbyteRecordMessage - return None, None # Skip invalid or non-record data - - if isinstance(parent_record, Record): - parent_partition = ( - parent_record.associated_slice.partition if parent_record.associated_slice else {} - ) - return {**parent_record.data}, {**parent_partition} - - if isinstance(parent_record, Mapping): - return {**parent_record}, {} - - raise AirbyteTracedException( - message=f"Parent stream returned records as invalid type {type(parent_record)}" - ) - def stream_slices(self) -> Iterable[StreamSlice]: """ Iterate over each parent stream's record and create a StreamSlice for each record. @@ -210,16 +173,29 @@ def stream_slices(self) -> Iterable[StreamSlice]: # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does # not support either substreams or RFR, but something that needs to be considered once we do - for raw_parent_record in parent_stream.read_only_records(): - # Process the parent record - parent_record, parent_partition = self.process_parent_record( - raw_parent_record, parent_stream.name - ) - - # Skip invalid or non-record data - if parent_record is None: - continue - + for parent_record in parent_stream.read_only_records(): + parent_partition = None + # Skip non-records (eg AirbyteLogMessage) + if isinstance(parent_record, AirbyteMessage): + self.logger.warning( + f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state." + ) + if parent_record.type == MessageType.RECORD: + parent_record = parent_record.record.data # type: ignore[union-attr, assignment] # record is always a Record + else: + continue + elif isinstance(parent_record, Record): + parent_partition = ( + parent_record.associated_slice.partition + if parent_record.associated_slice + else {} + ) + parent_record = parent_record.data + elif not isinstance(parent_record, Mapping): + # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. 
Anything else is invalid + raise AirbyteTracedException( + message=f"Parent stream returned records as invalid type {type(parent_record)}" + ) try: partition_value = dpath.get( parent_record, # type: ignore [arg-type] @@ -231,6 +207,9 @@ def stream_slices(self) -> Iterable[StreamSlice]: # Add extra fields extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields) + if parent_stream_config.lazy_read_pointer: + extracted_extra_fields = {"child_response": self._extract_child_response(parent_record, parent_stream_config.lazy_read_pointer), **extracted_extra_fields} + yield StreamSlice( partition={ partition_field: partition_value, @@ -240,6 +219,21 @@ def stream_slices(self) -> Iterable[StreamSlice]: extra_fields=extracted_extra_fields, ) + def _extract_child_response( + self, parent_record: MutableMapping[str, Any], pointer + ) -> requests.Response: + """Extract child records from a parent record based on lazy pointers.""" + + def _create_response(data: Mapping[str, Any]) -> SafeResponse: + """Create a SafeResponse with the given data.""" + response = SafeResponse() + response.content = json.dumps(data).encode("utf-8") + response.status_code = 200 + return response + + path = [path.eval(self.config) for path in pointer] + return _create_response(dpath.get(parent_record, path, default=[])) + def _extract_extra_fields( self, parent_record: Mapping[str, Any] | AirbyteMessage, @@ -416,3 +410,16 @@ def get_stream_state(self) -> Optional[Mapping[str, StreamState]]: @property def logger(self) -> logging.Logger: return logging.getLogger("airbyte.SubstreamPartitionRouter") + + +class SafeResponse(requests.Response): + def __getattr__(self, name: str) -> Any: + return getattr(requests.Response, name, None) + + @property + def content(self) -> Optional[bytes]: + return super().content + + @content.setter + def content(self, value: Union[str, bytes]) -> None: + self._content = value.encode() if isinstance(value, str) else value diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py index 3c1f25120..bbf014d8e 100644 --- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -12,14 +12,12 @@ Iterable, List, Mapping, - MutableMapping, Optional, Set, Tuple, Union, ) -import dpath import requests from typing_extensions import deprecated @@ -31,9 +29,6 @@ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import ( SinglePartitionRouter, ) -from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( - SubstreamPartitionRouter, -) from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator from airbyte_cdk.sources.declarative.requesters.request_options import ( @@ -358,7 +353,7 @@ def _fetch_next_page( # This logic is similar to _read_pages in the HttpStream class. When making changes here, consider making changes there as well. 
def _read_pages( self, - records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, ) -> Iterable[Record]: @@ -372,7 +367,7 @@ def _read_pages( last_page_size = 0 last_record: Optional[Record] = None - for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func + for record in records_generator_fn(response): last_page_size += 1 last_record = record yield record @@ -397,7 +392,7 @@ def _read_pages( def _read_single_page( self, - records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, ) -> Iterable[StreamData]: @@ -412,7 +407,7 @@ def _read_single_page( last_page_size = 0 last_record: Optional[Record] = None - for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func + for record in records_generator_fn(response): last_page_size += 1 last_record = record yield record @@ -455,6 +450,7 @@ def read_records( most_recent_record_from_slice = None record_generator = partial( self._parse_records, + stream_slice=stream_slice, stream_state=self.state or {}, records_schema=records_schema, ) @@ -636,19 +632,6 @@ def _fetch_next_page( ) -class SafeResponse(requests.Response): - def __getattr__(self, name: str) -> Any: - return getattr(requests.Response, name, None) - - @property - def content(self) -> Optional[bytes]: - return super().content - - @content.setter - def content(self, value: Union[str, bytes]) -> None: - self._content = value.encode() if isinstance(value, str) else value - - @deprecated( "This class is experimental. Use at your own risk.", category=ExperimentalClassWarning, @@ -659,124 +642,49 @@ class LazySimpleRetriever(SimpleRetriever): A retriever that supports lazy loading from parent streams. 
""" - partition_router: SubstreamPartitionRouter = field(init=True, repr=False, default=None) # type: ignore[assignment] # 'partition_router' is required for LazySimpleRetriever and is validated in the constructor - lazy_read_pointer: Optional[List[InterpolatedString]] = None - def _read_pages( self, - records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, ) -> Iterable[Record]: - parent_stream_config = self.partition_router.parent_stream_configs[-1] - parent_stream = parent_stream_config.stream - - for raw_parent_record in parent_stream.read_only_records(): - parent_record, parent_partition = self.partition_router.process_parent_record( - raw_parent_record, parent_stream.name - ) - if parent_record is None: - continue - - child_records = self._extract_child_records(parent_record) - response = self._create_response(child_records) - - if parent_stream_config.extra_fields: - extra_fields = [ - [field_path_part.eval(self.config) for field_path_part in field_path] # type: ignore [union-attr] - for field_path in parent_stream_config.extra_fields - ] - - extracted_extra_fields = self.partition_router._extract_extra_fields(parent_record, extra_fields) + response = stream_slice.extra_fields["child_response"] + print(f"LOL, {response.json()}") + if response: + last_page_size, last_record = 0, None + for record in records_generator_fn(response): # type: ignore[call-arg] # only _parse_records expected as a func + last_page_size += 1 + last_record = record + yield record - stream_slice = StreamSlice( - partition=stream_slice.partition, - cursor_slice=stream_slice.cursor_slice, - extra_fields=extracted_extra_fields + next_page_token = self._next_page_token(response, last_page_size, last_record, None) + if next_page_token: + yield from self._paginate( + next_page_token, + records_generator_fn, + stream_state, + stream_slice, ) - yield from self._yield_records_with_pagination( - response, - records_generator_fn, - stream_state, - stream_slice, - parent_record, - parent_stream_config, - ) - - yield from [] - - def _extract_child_records( - self, parent_record: MutableMapping[str, Any] - ) -> MutableMapping[str, Any]: - """Extract child records from a parent record based on lazy pointers.""" - if not self.lazy_read_pointer: - return parent_record - - path = [path.eval(self.config) for path in self.lazy_read_pointer] - return dpath.get(parent_record, path, default=[]) - - def _create_response(self, data: Mapping[str, Any]) -> SafeResponse: - """Create a SafeResponse with the given data.""" - response = SafeResponse() - response.content = json.dumps(data).encode("utf-8") - response.status_code = 200 - return response - - def _yield_records_with_pagination( - self, - response: requests.Response, - records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], - stream_state: Mapping[str, Any], - stream_slice: StreamSlice, - parent_record: MutableMapping[str, Any], - parent_stream_config: Any, - ) -> Iterable[Record]: - """Yield records, handling pagination if needed.""" - last_page_size, last_record = 0, None - - for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func - last_page_size += 1 - last_record = record - yield record - - next_page_token = self._next_page_token(response, last_page_size, 
last_record, None) - if next_page_token: - yield from self._paginate( - next_page_token, - records_generator_fn, - stream_state, - stream_slice, - parent_record, - parent_stream_config, - ) + yield from [] + else: + yield from self._read_pages(records_generator_fn, stream_state, stream_slice) def _paginate( self, next_page_token: Any, - records_generator_fn: Callable[[Optional[requests.Response], Optional[StreamSlice]], Iterable[Record]], + records_generator_fn: Callable[[Optional[requests.Response]], Iterable[Record]], stream_state: Mapping[str, Any], stream_slice: StreamSlice, - parent_record: MutableMapping[str, Any], - parent_stream_config: Any, ) -> Iterable[Record]: """Handle pagination by fetching subsequent pages.""" - partition_field = parent_stream_config.partition_field.eval(self.config) - partition_value = dpath.get( - parent_record, parent_stream_config.parent_key.eval(self.config) - ) - stream_slice = StreamSlice( - partition={partition_field: partition_value, "parent_slice": {}}, - cursor_slice=stream_slice.cursor_slice, - ) - pagination_complete = False while not pagination_complete: response = self._fetch_next_page(stream_state, stream_slice, next_page_token) last_page_size, last_record = 0, None - for record in records_generator_fn(response, stream_slice=stream_slice): # type: ignore[call-arg] # only _parse_records expected as a func + for record in records_generator_fn(response): # type: ignore[call-arg] # only _parse_records expected as a func last_page_size += 1 last_record = record yield record diff --git a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py index d4d48eac5..8a905cb68 100644 --- a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py +++ b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py @@ -43,7 +43,6 @@ }, "retriever": { "type": "SimpleRetriever", - "lazy_read_pointer": ["items"], "requester": { "type": "HttpRequester", "url_base": "https://api.test.com", @@ -79,6 +78,7 @@ "type": "ParentStreamConfig", "parent_key": "id", "partition_field": "parent_id", + "lazy_read_pointer": ["items"], "stream": { "type": "DeclarativeStream", "name": "parent", @@ -124,8 +124,6 @@ "end_datetime": {"datetime": "{{ now_utc().strftime('%Y-%m-%d') }}"}, "datetime_format": "%Y-%m-%d", "cursor_datetime_formats": ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S"], - "cursor_granularity": "P1D", - "step": "P15D", "cursor_field": "updated_at", "start_time_option": { "type": "RequestOption", @@ -222,13 +220,13 @@ def test_retriever_with_lazy_reading(): "id": 1, "name": "parent_1", "updated_at": "2024-07-13", - "items": {"data": [{"id": 1}, {"id": 2}], "has_more": True}, + "items": {"data": [{"id": 1, "updated_at": "2024-07-13"}, {"id": 2, "updated_at": "2024-07-13"}], "has_more": True}, }, { "id": 2, "name": "parent_2", "updated_at": "2024-07-13", - "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}, + "items": {"data": [{"id": 3, "updated_at": "2024-07-13"}, {"id": 4, "updated_at": "2024-07-13"}], "has_more": False}, }, ], "has_more": False, @@ -241,14 +239,14 @@ def test_retriever_with_lazy_reading(): HttpRequest( url="https://api.test.com/parent/1/items?starting_after=2&start=2024-07-01&end=2024-07-15" ), - HttpResponse(body=json.dumps({"data": [{"id": 5}, {"id": 6}], "has_more": True})), + HttpResponse(body=json.dumps({"data": [{"id": 5, "updated_at": "2024-07-13"}, {"id": 6, "updated_at": "2024-07-13"}], "has_more": True})), ) http_mocker.get( 
HttpRequest( url="https://api.test.com/parent/1/items?starting_after=6&start=2024-07-01&end=2024-07-15" ), - HttpResponse(body=json.dumps({"data": [{"id": 7}], "has_more": False})), + HttpResponse(body=json.dumps({"data": [{"id": 7, "updated_at": "2024-07-13"}], "has_more": False})), ) source = ConcurrentDeclarativeSource( @@ -259,15 +257,16 @@ def test_retriever_with_lazy_reading(): # Test full data retrieval (without state) full_records = get_records(source, _CONFIG, configured_catalog) expected_full = [ - {"id": 1}, - {"id": 2}, - {"id": 5}, - {"id": 6}, - {"id": 7}, - {"id": 3}, - {"id": 4}, + {"id": 1, "updated_at": "2024-07-13"}, + {"id": 2, "updated_at": "2024-07-13"}, + {"id": 3, "updated_at": "2024-07-13"}, + {"id": 4, "updated_at": "2024-07-13"}, + {"id": 5, "updated_at": "2024-07-13"}, + {"id": 6, "updated_at": "2024-07-13"}, + {"id": 7, "updated_at": "2024-07-13"}, ] - assert expected_full == full_records + + assert all(record in expected_full for record in full_records) @freezegun.freeze_time("2024-07-15") @@ -284,13 +283,13 @@ def test_incremental_sync_with_state(): "id": 1, "name": "parent_1", "updated_at": "2024-07-13", - "items": {"data": [{"id": 1}, {"id": 2}], "has_more": False}, + "items": {"data": [{"id": 1, "updated_at": "2024-07-13"}, {"id": 2, "updated_at": "2024-07-13"}], "has_more": False}, }, { "id": 2, "name": "parent_2", "updated_at": "2024-07-13", - "items": {"data": [{"id": 3}, {"id": 4}], "has_more": False}, + "items": {"data": [{"id": 3, "updated_at": "2024-07-13"}, {"id": 4, "updated_at": "2024-07-13"}], "has_more": False}, }, ], "has_more": False, @@ -336,4 +335,4 @@ def test_incremental_sync_with_state(): {"id": 10, "updated_at": "2024-07-13"}, {"id": 11, "updated_at": "2024-07-13"}, ] - assert expected_incremental == incremental_records + assert all(record in expected_incremental for record in incremental_records) From b278dcdcb4cba69de0a9386db91e200461220276 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 13 Mar 2025 18:37:07 +0000 Subject: [PATCH 17/25] Auto-fix lint and format issues --- .../models/declarative_component_schema.py | 84 +++++++------------ .../parsers/model_to_component_factory.py | 19 +++-- .../substream_partition_router.py | 26 ++++-- .../retrievers/test_lazy_simple_retriever.py | 50 +++++++++-- 4 files changed, 105 insertions(+), 74 deletions(-) diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index aa32aa3bb..6436842be 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -609,9 +609,7 @@ class OAuthAuthenticator(BaseModel): scopes: Optional[List[str]] = Field( None, description="List of scopes that should be granted to the access token.", - examples=[ - ["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"] - ], + examples=[["crm.list.read", "crm.objects.contacts.read", "crm.schema.contacts.read"]], title="Scopes", ) token_expiry_date: Optional[str] = Field( @@ -1080,28 +1078,24 @@ class OAuthConfigSpecification(BaseModel): class Config: extra = Extra.allow - oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = ( - Field( - None, - description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", - examples=[ - {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, - { - "app_id": { - "type": "string", - "path_in_connector_config": ["info", "app_id"], - } - }, - ], - title="OAuth user input", - ) + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( + None, + description="OAuth specific blob. This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\nExamples:\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", + examples=[ + {"app_id": {"type": "string", "path_in_connector_config": ["app_id"]}}, + { + "app_id": { + "type": "string", + "path_in_connector_config": ["info", "app_id"], + } + }, + ], + title="OAuth user input", ) - oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = ( - Field( - None, - description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + 
codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', - title="DeclarativeOAuth Connector Specification", - ) + oauth_connector_input_specification: Optional[OauthConnectorInputSpecification] = Field( + None, + description='The DeclarativeOAuth specific blob.\nPertains to the fields defined by the connector relating to the OAuth flow.\n\nInterpolation capabilities:\n- The variables placeholders are declared as `{{my_var}}`.\n- The nested resolution variables like `{{ {{my_nested_var}} }}` is allowed as well.\n\n- The allowed interpolation context is:\n + base64Encoder - encode to `base64`, {{ {{my_var_a}}:{{my_var_b}} | base64Encoder }}\n + base64Decorer - decode from `base64` encoded string, {{ {{my_string_variable_or_string_value}} | base64Decoder }}\n + urlEncoder - encode the input string to URL-like format, {{ https://test.host.com/endpoint | urlEncoder}}\n + urlDecorer - decode the input url-encoded string into text format, {{ urlDecoder:https%3A%2F%2Fairbyte.io | urlDecoder}}\n + codeChallengeS256 - get the `codeChallenge` encoded value to provide additional data-provider specific authorisation values, {{ {{state_value}} | codeChallengeS256 }}\n\nExamples:\n - The TikTok Marketing DeclarativeOAuth spec:\n {\n "oauth_connector_input_specification": {\n "type": "object",\n "additionalProperties": false,\n "properties": {\n "consent_url": "https://ads.tiktok.com/marketing_api/auth?{{client_id_key}}={{client_id_value}}&{{redirect_uri_key}}={{ {{redirect_uri_value}} | urlEncoder}}&{{state_key}}={{state_value}}",\n "access_token_url": "https://business-api.tiktok.com/open_api/v1.3/oauth2/access_token/",\n "access_token_params": {\n "{{ auth_code_key }}": "{{ auth_code_value }}",\n "{{ client_id_key }}": "{{ client_id_value }}",\n "{{ client_secret_key }}": "{{ client_secret_value }}"\n },\n "access_token_headers": {\n "Content-Type": "application/json",\n "Accept": "application/json"\n },\n "extract_output": ["data.access_token"],\n "client_id_key": "app_id",\n "client_secret_key": "secret",\n "auth_code_key": "auth_code"\n }\n }\n }', + title="DeclarativeOAuth Connector Specification", ) complete_oauth_output_specification: Optional[Dict[str, Any]] = Field( None, @@ -1119,9 +1113,7 @@ class Config: complete_oauth_server_input_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations persisted as Airbyte Server configurations.\nMust be a valid non-nested JSON describing additional fields configured by the Airbyte Instance or Workspace Admins to be used by the\nserver when completing an OAuth flow (typically exchanging an auth code for refresh token).\nExamples:\n complete_oauth_server_input_specification={\n client_id: {\n type: string\n },\n client_secret: {\n type: string\n }\n }", - examples=[ - {"client_id": {"type": "string"}, "client_secret": {"type": "string"}} - ], + examples=[{"client_id": {"type": "string"}, "client_secret": {"type": "string"}}], title="OAuth input specification", ) complete_oauth_server_output_specification: Optional[Dict[str, Any]] = Field( @@ -1774,9 +1766,7 @@ class RecordSelector(BaseModel): description="Responsible for filtering records to be emitted by the Source.", title="Record Filter", ) - schema_normalization: Optional[ - Union[SchemaNormalization, CustomSchemaNormalization] - ] = Field( + schema_normalization: Optional[Union[SchemaNormalization, CustomSchemaNormalization]] = Field( SchemaNormalization.None_, description="Responsible for normalization according to the schema.", title="Schema Normalization", @@ -1997,9 +1987,7 @@ class Config: description="Component used to fetch data incrementally based on a time field in the data.", title="Incremental Sync", ) - name: Optional[str] = Field( - "", description="The stream name.", example=["Users"], title="Name" - ) + name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") primary_key: Optional[PrimaryKey] = Field( "", description="The primary key of the stream.", title="Primary Key" ) @@ -2257,9 +2245,7 @@ class ParentStreamConfig(BaseModel): class StateDelegatingStream(BaseModel): type: Literal["StateDelegatingStream"] - name: str = Field( - ..., description="The stream name.", example=["Users"], title="Name" - ) + name: str = Field(..., description="The stream name.", example=["Users"], title="Name") full_refresh_stream: DeclarativeStream = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages when the state is empty or not provided.", @@ -2296,11 +2282,7 @@ class SimpleRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -2344,9 +2326,7 @@ class AsyncRetriever(BaseModel): ) download_extractor: Optional[ Union[CustomRecordExtractor, DpathExtractor, ResponseToFileExtractor] - ] = Field( - None, description="Responsible for fetching the records from provided urls." 
- ) + ] = Field(None, description="Responsible for fetching the records from provided urls.") creation_requester: Union[CustomRequester, HttpRequester] = Field( ..., description="Requester component that describes how to prepare HTTP requests to send to the source API to create the async server-side job.", @@ -2380,11 +2360,7 @@ class AsyncRetriever(BaseModel): CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter, - List[ - Union[ - CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter - ] - ], + List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]], ] ] = Field( [], @@ -2452,12 +2428,10 @@ class DynamicDeclarativeStream(BaseModel): stream_template: DeclarativeStream = Field( ..., description="Reference to the stream template.", title="Stream Template" ) - components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = ( - Field( - ..., - description="Component resolve and populates stream templates with components values.", - title="Components Resolver", - ) + components_resolver: Union[HttpComponentsResolver, ConfigComponentsResolver] = Field( + ..., + description="Component resolve and populates stream templates with components values.", + title="Components Resolver", ) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 53f62e762..7d73b0259 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2530,7 +2530,9 @@ def create_parent_stream_config( ) if "*" in model.lazy_read_pointer: - raise ValueError("The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed.") + raise ValueError( + "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." + ) return ParentStreamConfig( parent_key=model.parent_key, @@ -2541,7 +2543,7 @@ def create_parent_stream_config( incremental_dependency=model.incremental_dependency or False, parameters=model.parameters or {}, extra_fields=model.extra_fields, - lazy_read_pointer=model.lazy_read_pointer + lazy_read_pointer=model.lazy_read_pointer, ) @staticmethod @@ -2745,9 +2747,16 @@ def create_simple_retriever( model.ignore_stream_slicer_parameters_on_paginated_requests or False ) - if hasattr(model, "partition_router") and model.partition_router and model.partition_router.type == "SubstreamPartitionRouter" and not bool( - self._connector_state_manager.get_stream_state(name, None)) and any(parent_stream_config.lazy_read_pointer for parent_stream_config in model.partition_router.parent_stream_configs): - + if ( + hasattr(model, "partition_router") + and model.partition_router + and model.partition_router.type == "SubstreamPartitionRouter" + and not bool(self._connector_state_manager.get_stream_state(name, None)) + and any( + parent_stream_config.lazy_read_pointer + for parent_stream_config in model.partition_router.parent_stream_configs + ) + ): if incremental_sync.step or incremental_sync.cursor_granularity: raise ValueError( f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}."
diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 118a40c2b..cd37ce93b 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -3,8 +3,8 @@ # -import json import copy +import json import logging from dataclasses import InitVar, dataclass from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union @@ -64,11 +64,16 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: for key_path in self.extra_fields ] - self.lazy_read_pointer = [ - InterpolatedString.create(path, parameters=parameters) - if isinstance(path, str) - else path for path in self.lazy_read_pointer - ] if self.lazy_read_pointer else None + self.lazy_read_pointer = ( + [ + InterpolatedString.create(path, parameters=parameters) + if isinstance(path, str) + else path + for path in self.lazy_read_pointer + ] + if self.lazy_read_pointer + else None + ) @dataclass @@ -208,7 +213,12 @@ def stream_slices(self) -> Iterable[StreamSlice]: extracted_extra_fields = self._extract_extra_fields(parent_record, extra_fields) if parent_stream_config.lazy_read_pointer: - extracted_extra_fields = {"child_response": self._extract_child_response(parent_record, parent_stream_config.lazy_read_pointer), **extracted_extra_fields} + extracted_extra_fields = { + "child_response": self._extract_child_response( + parent_record, parent_stream_config.lazy_read_pointer + ), + **extracted_extra_fields, + } yield StreamSlice( partition={ @@ -220,7 +230,7 @@ def stream_slices(self) -> Iterable[StreamSlice]: ) def _extract_child_response( - self, parent_record: MutableMapping[str, Any], pointer + self, parent_record: MutableMapping[str, Any], pointer ) -> requests.Response: """Extract child records from a parent record based on lazy pointers.""" diff --git a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py index 8a905cb68..d98f733dc 100644 --- a/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py +++ b/unit_tests/sources/declarative/retrievers/test_lazy_simple_retriever.py @@ -220,13 +220,25 @@ def test_retriever_with_lazy_reading(): "id": 1, "name": "parent_1", "updated_at": "2024-07-13", - "items": {"data": [{"id": 1, "updated_at": "2024-07-13"}, {"id": 2, "updated_at": "2024-07-13"}], "has_more": True}, + "items": { + "data": [ + {"id": 1, "updated_at": "2024-07-13"}, + {"id": 2, "updated_at": "2024-07-13"}, + ], + "has_more": True, + }, }, { "id": 2, "name": "parent_2", "updated_at": "2024-07-13", - "items": {"data": [{"id": 3, "updated_at": "2024-07-13"}, {"id": 4, "updated_at": "2024-07-13"}], "has_more": False}, + "items": { + "data": [ + {"id": 3, "updated_at": "2024-07-13"}, + {"id": 4, "updated_at": "2024-07-13"}, + ], + "has_more": False, + }, }, ], "has_more": False, @@ -239,14 +251,28 @@ def test_retriever_with_lazy_reading(): HttpRequest( url="https://api.test.com/parent/1/items?starting_after=2&start=2024-07-01&end=2024-07-15" ), - HttpResponse(body=json.dumps({"data": [{"id": 5, "updated_at": "2024-07-13"}, {"id": 6, "updated_at": "2024-07-13"}], "has_more": True})), + HttpResponse( + body=json.dumps( + { + "data": [ + {"id": 5, "updated_at": "2024-07-13"}, + {"id": 6, "updated_at": "2024-07-13"}, + ], + "has_more": True, + } + ) + ), ) 
http_mocker.get( HttpRequest( url="https://api.test.com/parent/1/items?starting_after=6&start=2024-07-01&end=2024-07-15" ), - HttpResponse(body=json.dumps({"data": [{"id": 7, "updated_at": "2024-07-13"}], "has_more": False})), + HttpResponse( + body=json.dumps( + {"data": [{"id": 7, "updated_at": "2024-07-13"}], "has_more": False} + ) + ), ) source = ConcurrentDeclarativeSource( @@ -283,13 +309,25 @@ def test_incremental_sync_with_state(): "id": 1, "name": "parent_1", "updated_at": "2024-07-13", - "items": {"data": [{"id": 1, "updated_at": "2024-07-13"}, {"id": 2, "updated_at": "2024-07-13"}], "has_more": False}, + "items": { + "data": [ + {"id": 1, "updated_at": "2024-07-13"}, + {"id": 2, "updated_at": "2024-07-13"}, + ], + "has_more": False, + }, }, { "id": 2, "name": "parent_2", "updated_at": "2024-07-13", - "items": {"data": [{"id": 3, "updated_at": "2024-07-13"}, {"id": 4, "updated_at": "2024-07-13"}], "has_more": False}, + "items": { + "data": [ + {"id": 3, "updated_at": "2024-07-13"}, + {"id": 4, "updated_at": "2024-07-13"}, + ], + "has_more": False, + }, }, ], "has_more": False, From b06800f59cc6aa22538846fa0127abbf01c007e5 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 19:50:21 +0100 Subject: [PATCH 18/25] Update some conditions --- .../sources/declarative/parsers/model_to_component_factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 7d73b0259..ce5fb5e9d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2529,7 +2529,7 @@ def create_parent_stream_config( else None ) - if "*" in model.lazy_read_pointer: + if model.lazy_read_pointer and any("*" in pointer for pointer in model.lazy_read_pointer): raise ValueError( "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed." ) @@ -2757,7 +2757,7 @@ def create_simple_retriever( for parent_stream_config in model.partition_router.parent_stream_configs ) ): - if incremental_sync.step or incremental_sync.cursor_granularity: + if incremental_sync and (incremental_sync.step or incremental_sync.cursor_granularity): raise ValueError( f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." ) From 11533d8e18b7fb59cd4a0dd20b9ad70155552253 Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 20:15:47 +0100 Subject: [PATCH 19/25] Fix mypy --- .../parsers/model_to_component_factory.py | 28 ++++++++++++++----- .../substream_partition_router.py | 8 +++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index ce5fb5e9d..37c43b9f2 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2534,6 +2534,10 @@ def create_parent_stream_config( "The '*' wildcard in 'lazy_read_pointer' is not supported — only direct paths are allowed."
) + model_lazy_read_pointer: List[Union[InterpolatedString, str]] = ( + [x for x in model.lazy_read_pointer] if model.lazy_read_pointer else [] + ) + return ParentStreamConfig( parent_key=model.parent_key, request_option=request_option, @@ -2543,7 +2547,7 @@ def create_parent_stream_config( incremental_dependency=model.incremental_dependency or False, parameters=model.parameters or {}, extra_fields=model.extra_fields, - lazy_read_pointer=model.lazy_read_pointer, + lazy_read_pointer=model_lazy_read_pointer, ) @staticmethod @@ -2752,18 +2756,28 @@ def create_simple_retriever( ) if ( - hasattr(model, "partition_router") - and model.partition_router - and model.partition_router.type == "SubstreamPartitionRouter" + model.partition_router + and model.partition_router.type == "SubstreamPartitionRouter" # type: ignore[union-attr] # 'model' is SimpleRetrieverModel and not bool(self._connector_state_manager.get_stream_state(name, None)) and any( parent_stream_config.lazy_read_pointer - for parent_stream_config in model.partition_router.parent_stream_configs + for parent_stream_config in model.partition_router.parent_stream_configs # type: ignore[union-attr] # partition_router type guaranteed by a condition earlier ) ): - if incremental_sync and (incremental_sync.step or incremental_sync.cursor_granularity): + if incremental_sync: + if incremental_sync.type != "DatetimeBasedCursor": + raise ValueError( + f"LazySimpleRetriever only supports DatetimeBasedCursor. Found: {incremental_sync.type}." + ) + + elif incremental_sync.step or incremental_sync.cursor_granularity: + raise ValueError( + f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." + ) + + if model.decoder and model.decoder.type != "JsonDecoder": raise ValueError( - f"Found more than one slice per parent. LazySimpleRetriever only supports single slice read for stream - {name}." + f"LazySimpleRetriever only supports JsonDecoder. Found: {model.decoder.type}."
) diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index cd37ce93b..03c98f125 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -215,7 +215,7 @@ def stream_slices(self) -> Iterable[StreamSlice]: if parent_stream_config.lazy_read_pointer: extracted_extra_fields = { "child_response": self._extract_child_response( - parent_record, parent_stream_config.lazy_read_pointer + parent_record, parent_stream_config.lazy_read_pointer # type: ignore[arg-type] # lazy_read_pointer type handled in __post_init__ of parent_stream_config ), **extracted_extra_fields, } @@ -230,11 +230,11 @@ def stream_slices(self) -> Iterable[StreamSlice]: ) def _extract_child_response( - self, parent_record: MutableMapping[str, Any], pointer + self, parent_record: Mapping[str, Any] | AirbyteMessage, pointer: List[InterpolatedString] ) -> requests.Response: """Extract child records from a parent record based on lazy pointers.""" - def _create_response(data: Mapping[str, Any]) -> SafeResponse: + def _create_response(data: MutableMapping[str, Any]) -> SafeResponse: """Create a SafeResponse with the given data.""" response = SafeResponse() response.content = json.dumps(data).encode("utf-8") @@ -242,7 +242,7 @@ def _create_response(data: Mapping[str, Any]) -> SafeResponse: return response path = [path.eval(self.config) for path in pointer] - return _create_response(dpath.get(parent_record, path, default=[])) + return _create_response(dpath.get(parent_record, path, default=[])) # type: ignore # argument will be a MutableMapping, given input data structure def _extract_extra_fields( self, From 1a10bb63fa64cf3ecde2d1b31a4073fcb0c0ac9d Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Thu, 13 Mar 2025 19:18:30 +0000 Subject: [PATCH 20/25] Auto-fix lint and format issues --- .../partition_routers/substream_partition_router.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py index 03c98f125..51d654c5b 100644 --- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py @@ -215,7 +215,8 @@ def stream_slices(self) -> Iterable[StreamSlice]: if parent_stream_config.lazy_read_pointer: extracted_extra_fields = { "child_response": self._extract_child_response( - parent_record, parent_stream_config.lazy_read_pointer # type: ignore[arg-type] # lazy_read_pointer type handled in __post_init__ of parent_stream_config + parent_record, + parent_stream_config.lazy_read_pointer, # type: ignore[arg-type] # lazy_read_pointer type handled in __post_init__ of parent_stream_config ), **extracted_extra_fields, } From c90342081586b2c154f6a6bd05f35317057f2a7c Mon Sep 17 00:00:00 2001 From: Serhii Lazebnyi Date: Thu, 13 Mar 2025 20:28:38 +0100 Subject: [PATCH 21/25] Fix unittest --- .../sources/declarative/parsers/model_to_component_factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 37c43b9f2..c5f0ae04c
--- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
+++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -2753,11 +2753,11 @@ def create_simple_retriever(
 
         if (
             model.partition_router
-            and model.partition_router.type == "SubstreamPartitionRouter"  # type: ignore[union-attr] # 'model' is SimpleRetrieverModel
+            and isinstance(model.partition_router, SubstreamPartitionRouterModel)
             and not bool(self._connector_state_manager.get_stream_state(name, None))
             and any(
                 parent_stream_config.lazy_read_pointer
-                for parent_stream_config in model.partition_router.parent_stream_configs  # type: ignore[union-attr] # partition_router type guaranteed by an earlier condition
+                for parent_stream_config in model.partition_router.parent_stream_configs
             )
         ):
             if incremental_sync:

From 4687726ba489083415331967756fc54965912fb2 Mon Sep 17 00:00:00 2001
From: Serhii Lazebnyi
Date: Thu, 13 Mar 2025 23:11:10 +0100
Subject: [PATCH 22/25] Fix typo

---
 .../sources/declarative/parsers/model_to_component_factory.py | 2 +-
 airbyte_cdk/sources/declarative/retrievers/simple_retriever.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
index c5f0ae04c..79557ec5b 100644
--- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
+++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -1927,7 +1927,7 @@ def _merge_stream_slicers(
             # we could support here by calling create_concurrent_cursor_from_perpartition_cursor
             raise ValueError("Per partition state is not supported yet for AsyncRetriever.")
 
-        stream_slicer = self._build_stream_slicer_from_partition_router(retriever_model, config)
+        stream_slicer = self._build_stream_slicer_from_partition_router(retriever_model, config, stream_name=model.name)
 
         if model.incremental_sync:
             return self._build_incremental_cursor(model, stream_slicer, config)
diff --git a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py
index bbf014d8e..65aa5d406 100644
--- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py
+++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py
@@ -649,7 +649,6 @@ def _read_pages(
         stream_slice: StreamSlice,
     ) -> Iterable[Record]:
         response = stream_slice.extra_fields["child_response"]
-        print(f"LOL, {response.json()}")
         if response:
             last_page_size, last_record = 0, None
             for record in records_generator_fn(response):  # type: ignore[call-arg] # only _parse_records expected as a func

From 3d31e271c26476d65ffec5ee804c5464e57bb939 Mon Sep 17 00:00:00 2001
From: octavia-squidington-iii
Date: Thu, 13 Mar 2025 22:17:42 +0000
Subject: [PATCH 23/25] Auto-fix lint and format issues

---
 .../sources/declarative/parsers/model_to_component_factory.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
index ed6b4736a..86e880b20 100644
--- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
+++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -1908,7 +1908,9 @@ def _merge_stream_slicers(
     ) -> Optional[StreamSlicer]:
         retriever_model = model.retriever
 
-        stream_slicer = self._build_stream_slicer_from_partition_router(retriever_model, config, stream_name=model.name)
+        stream_slicer = self._build_stream_slicer_from_partition_router(
+            retriever_model, config, stream_name=model.name
+        )
 
         if retriever_model.type == "AsyncRetriever":
             is_not_datetime_cursor = (

From 08296623410967df16c3ce59b88e43db0863217c Mon Sep 17 00:00:00 2001
From: Serhii Lazebnyi
Date: Thu, 13 Mar 2025 23:24:17 +0100
Subject: [PATCH 24/25] Add docs to SafeResponse

---
 .../partition_routers/substream_partition_router.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
index 51d654c5b..50275b984 100644
--- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
+++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
@@ -424,6 +424,11 @@ def logger(self) -> logging.Logger:
 
 
 class SafeResponse(requests.Response):
+    """
+    A subclass of requests.Response that acts as an interface to migrate parsed child records
+    into a response object. This allows seamless interaction with child records as if they
+    were the original response, ensuring compatibility with methods that expect the requests.Response data type.
+    """
     def __getattr__(self, name: str) -> Any:
         return getattr(requests.Response, name, None)

From ad521a05ef6e8e7b69f2565876291f3c422b4b23 Mon Sep 17 00:00:00 2001
From: octavia-squidington-iii
Date: Thu, 13 Mar 2025 22:25:48 +0000
Subject: [PATCH 25/25] Auto-fix lint and format issues

---
 .../declarative/partition_routers/substream_partition_router.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
index 9a41286ff..50fe4f0bc 100644
--- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
+++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
@@ -424,6 +424,7 @@ class SafeResponse(requests.Response):
     into a response object. This allows seamless interaction with child records as if they
     were the original response, ensuring compatibility with methods that expect the requests.Response data type.
     """
+
     def __getattr__(self, name: str) -> Any:
         return getattr(requests.Response, name, None)
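
Note on the lazy-read mechanism implemented by this series: child records that already sit inside the parent stream's response are pulled out with a dpath pointer and wrapped in a synthetic requests.Response, so the rest of the retriever can keep treating them as an ordinary HTTP response and no extra child request is issued. The snippet below is a minimal, self-contained sketch of that idea, not CDK code; FakeChildResponse, extract_child_records, and the sample payload are invented names for illustration only.

import json
from typing import Any, List, Mapping

import dpath
import requests


class FakeChildResponse(requests.Response):
    """Illustrative stand-in for the SafeResponse pattern in the patches above."""

    def __init__(self, data: Any) -> None:
        super().__init__()
        # requests.Response serves .content from the private _content attribute,
        # so pre-populating it makes .json() and .text work without any HTTP call.
        self._content = json.dumps(data).encode("utf-8")
        self.status_code = 200


def extract_child_records(
    parent_record: Mapping[str, Any], pointer: List[str]
) -> requests.Response:
    # dpath.get walks the nested mapping along `pointer`; the empty-list default
    # models a parent record that carries no embedded child records.
    return FakeChildResponse(dpath.get(parent_record, pointer, default=[]))


parent = {"id": "p1", "items": [{"id": "c1"}, {"id": "c2"}]}
child_response = extract_child_records(parent, ["items"])
print(child_response.json())  # [{'id': 'c1'}, {'id': 'c2'}]

Pre-populating the response body this way is what lets downstream record selection stay response-based, which is the same design choice SafeResponse makes for LazySimpleRetriever.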