Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: (CDK) (Manifest) - Deduplicate common components to shared + ref #447

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from

Conversation

bazarnov
Copy link
Contributor

@bazarnov bazarnov commented Mar 26, 2025

What

Resolves:

How

TODO

User Impact

No impact is expected, this is not a breaking change.

@lmossman
Copy link
Contributor

lmossman commented Mar 28, 2025

@bazarnov as discussed on our call, here are some examples of input YAML manifest and expected output:

First input
version: 6.41.5

type: DeclarativeSource

check:
  type: CheckStream
  stream_names:
    - pokemon

definitions:
  streams:
    pokemon:
      type: DeclarativeStream
      name: pokemon
      retriever:
        type: SimpleRetriever
        decoder:
          type: JsonDecoder
        requester:
          $ref: "#/definitions/base_requester"
          path: pokemon
          http_method: GET
        record_selector:
          type: RecordSelector
          extractor:
            type: DpathExtractor
            field_path: []
      schema_loader:
        type: InlineSchemaLoader
        schema:
          $ref: "#/schemas/pokemon"
    location:
      type: DeclarativeStream
      name: location
      retriever:
        type: SimpleRetriever
        decoder:
          type: JsonDecoder
        requester:
          $ref: "#/definitions/base_requester"
          path: location
          http_method: GET
        record_selector:
          type: RecordSelector
          extractor:
            type: DpathExtractor
            field_path: []
      schema_loader:
        type: InlineSchemaLoader
        schema:
          $ref: "#/schemas/location"
  base_requester:
    type: HttpRequester
    url_base: https://pokeapi.co/api/v2/

streams:
  - $ref: "#/definitions/streams/pokemon"
  - $ref: "#/definitions/streams/location"

spec:
  type: Spec
  connection_specification:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    required: []
    properties: {}
    additionalProperties: true

metadata:
  assist: {}
  testedStreams:
    pokemon:
      hasRecords: true
      streamHash: 6f6304126c7d27b92d6f753eaaaea074a205e096
      hasResponse: true
      primaryKeysAreUnique: true
      primaryKeysArePresent: true
      responsesAreSuccessful: true
    location:
      streamHash: null
  autoImportSchema:
    pokemon: false
    location: true

schemas:
  pokemon:
    type: object
    $schema: http://json-schema.org/schema#
    properties:
      url:
        type: string
      name:
        type: string
    additionalProperties: true
  location:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    properties: {}
    additionalProperties: true
First output
{
    "manifest": {
        "spec": {
            "type": "Spec",
            "connection_specification": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "required": [],
                "properties": {},
                "additionalProperties": true
            }
        },
        "type": "DeclarativeSource",
        "check": {
            "type": "CheckStream",
            "stream_names": [
                "pokemon"
            ]
        },
        "schemas": {
            "pokemon": {
                "type": "object",
                "$schema": "http://json-schema.org/schema#",
                "properties": {
                    "url": {
                        "type": "string"
                    },
                    "name": {
                        "type": "string"
                    }
                },
                "additionalProperties": true
            },
            "location": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "properties": {},
                "additionalProperties": true
            }
        },
        "streams": [
            {
                "name": "pokemon",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "type": "HttpRequester",
                        "url_base": {
                          "$ref": "#/definitions/shared/HttpRequester/url_base"
                        },
                        "path": "pokemon",
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/pokemon"
                    }
                }
            },
            {
                "name": "location",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "type": "HttpRequester",
                        "url_base": {
                          "$ref": "#/definitions/shared/HttpRequester/url_base"
                        },
                        "path": "location",
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/location"
                    }
                }
            }
        ],
        "version": "6.41.5",
        "metadata": {
            "assist": {},
            "testedStreams": {
                "pokemon": {
                    "hasRecords": true,
                    "streamHash": "6f6304126c7d27b92d6f753eaaaea074a205e096",
                    "hasResponse": true,
                    "primaryKeysAreUnique": true,
                    "primaryKeysArePresent": true,
                    "responsesAreSuccessful": true
                },
                "location": {
                    "streamHash": null
                }
            },
            "autoImportSchema": {
                "pokemon": false,
                "location": true
            }
        },
        "definitions": {
          "shared": {
            "HttpRequester": {
              "url_base": "https://pokeapi.co/api/v2/"
            }
          }
        }
    }
}

Second input (in this case there are more occurrences of "someotherapi" than "pokeapi", so "someotherapi" is the one that gets deduped and "pokeapi" is left in a duplicated state)
version: 6.41.5

type: DeclarativeSource

check:
  type: CheckStream
  stream_names:
    - pokemon

streams:
  - type: DeclarativeStream
    name: pokemon
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: pokemon
        url_base: https://pokeapi.co/api/v2/
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/pokemon"
  - type: DeclarativeStream
    name: location
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: location
        url_base: https://pokeapi.co/api/v2/
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/location"
  - type: DeclarativeStream
    name: somethingElse
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: location
        url_base: https://someotherapi.com
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/somethingElse"
  - type: DeclarativeStream
    name: somethingElse2
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: location
        url_base: https://someotherapi.com
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/somethingElse2"
  - type: DeclarativeStream
    name: somethingElse3
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: location
        url_base: https://someotherapi.com
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/somethingElse3"

spec:
  type: Spec
  connection_specification:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    required: []
    properties: {}
    additionalProperties: true

metadata:
  assist: {}
  testedStreams:
    pokemon:
      hasRecords: true
      streamHash: 6f6304126c7d27b92d6f753eaaaea074a205e096
      hasResponse: true
      primaryKeysAreUnique: true
      primaryKeysArePresent: true
      responsesAreSuccessful: true
    location:
      streamHash: null
    somethingElse:
      streamHash: fc475d6b671a83c102b6f1b51bad18512f22455f
  autoImportSchema:
    pokemon: false
    location: true

schemas:
  pokemon:
    type: object
    $schema: http://json-schema.org/schema#
    properties:
      url:
        type: string
      name:
        type: string
    additionalProperties: true
  location:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    properties: {}
    additionalProperties: true
  somethingElse:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    properties: {}
    additionalProperties: true
  somethingElse2:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    properties: {}
    additionalProperties: true
  somethingElse3:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    properties: {}
    additionalProperties: true
Second output
{
    "manifest": {
        "spec": {
            "type": "Spec",
            "connection_specification": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "required": [],
                "properties": {},
                "additionalProperties": true
            }
        },
        "type": "DeclarativeSource",
        "check": {
            "type": "CheckStream",
            "stream_names": [
                "pokemon"
            ]
        },
        "schemas": {
            "pokemon": {
                "type": "object",
                "$schema": "http://json-schema.org/schema#",
                "properties": {
                    "url": {
                        "type": "string"
                    },
                    "name": {
                        "type": "string"
                    }
                },
                "additionalProperties": true
            },
            "location": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "properties": {},
                "additionalProperties": true
            },
            "somethingElse": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "properties": {},
                "additionalProperties": true
            },
            "somethingElse2": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "properties": {},
                "additionalProperties": true
            },
            "somethingElse3": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "properties": {},
                "additionalProperties": true
            }
        },
        "streams": [
            {
                "name": "pokemon",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "pokemon",
                        "type": "HttpRequester",
                        "url_base": "https://pokeapi.co/api/v2/",
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/pokemon"
                    }
                }
            },
            {
                "name": "location",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "location",
                        "type": "HttpRequester",
                        "url_base": "https://pokeapi.co/api/v2/",
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/location"
                    }
                }
            },
            {
                "name": "somethingElse",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "location",
                        "type": "HttpRequester",
                        "url_base": {
                            "$ref": "#/definitions/shared/HttpRequester/url_base"
                        },
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/somethingElse"
                    }
                }
            },
            {
                "name": "somethingElse2",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "location",
                        "type": "HttpRequester",
                        "url_base": {
                            "$ref": "#/definitions/shared/HttpRequester/url_base"
                        },
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/somethingElse2"
                    }
                }
            },
            {
                "name": "somethingElse3",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "location",
                        "type": "HttpRequester",
                        "url_base": {
                            "$ref": "#/definitions/shared/HttpRequester/url_base"
                        },
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/somethingElse3"
                    }
                }
            }
        ],
        "version": "6.41.5",
        "metadata": {
            "assist": {},
            "testedStreams": {
                "pokemon": {
                    "hasRecords": true,
                    "streamHash": "6f6304126c7d27b92d6f753eaaaea074a205e096",
                    "hasResponse": true,
                    "primaryKeysAreUnique": true,
                    "primaryKeysArePresent": true,
                    "responsesAreSuccessful": true
                },
                "location": {
                    "streamHash": null
                },
                "somethingElse": {
                    "streamHash": "fc475d6b671a83c102b6f1b51bad18512f22455f"
                }
            },
            "autoImportSchema": {
                "pokemon": false,
                "location": true
            }
        },
        "definitions": {
            "shared": {
                "HttpRequester": {
                    "url_base": "https://someotherapi.com"
                }
            }
        }
    }
}

And here is an example where the shareable fields are already being shared:

Input
version: 6.41.5

type: DeclarativeSource

check:
  type: CheckStream
  stream_names:
    - pokemon

definitions: 
  shared:
    HttpRequester:
      url_base: https://pokeapi.co/api/v2/

streams:
  - type: DeclarativeStream
    name: pokemon
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: pokemon
        url_base:
          $ref: "#/definitions/shared/HttpRequester/url_base"
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/pokemon"
  - type: DeclarativeStream
    name: location
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: location
        url_base: https://someotherapi.com
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/location"
  - type: DeclarativeStream
    name: somethingElse
    retriever:
      type: SimpleRetriever
      decoder:
        type: JsonDecoder
      requester:
        type: HttpRequester
        path: location
        url_base: https://someotherapi.com
        http_method: GET
      record_selector:
        type: RecordSelector
        extractor:
          type: DpathExtractor
          field_path: []
    schema_loader:
      type: InlineSchemaLoader
      schema:
        $ref: "#/schemas/somethingElse"

spec:
  type: Spec
  connection_specification:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    required: []
    properties: {}
    additionalProperties: true

metadata:
  assist: {}
  testedStreams:
    pokemon:
      hasRecords: true
      streamHash: 6f6304126c7d27b92d6f753eaaaea074a205e096
      hasResponse: true
      primaryKeysAreUnique: true
      primaryKeysArePresent: true
      responsesAreSuccessful: true
    location:
      streamHash: null
    somethingElse:
      streamHash: fc475d6b671a83c102b6f1b51bad18512f22455f
  autoImportSchema:
    pokemon: false
    location: true

schemas:
  pokemon:
    type: object
    $schema: http://json-schema.org/schema#
    properties:
      url:
        type: string
      name:
        type: string
    additionalProperties: true
  location:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    properties: {}
    additionalProperties: true
  somethingElse:
    type: object
    $schema: http://json-schema.org/draft-07/schema#
    properties: {}
    additionalProperties: true

As you can see in the output, we don't try to deduplicate that field in this case, since one of the streams is already pointing to shared/HttpRequester/url_base:

Output
{
    "manifest": {
        "spec": {
            "type": "Spec",
            "connection_specification": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "required": [],
                "properties": {},
                "additionalProperties": true
            }
        },
        "type": "DeclarativeSource",
        "check": {
            "type": "CheckStream",
            "stream_names": [
                "pokemon"
            ]
        },
        "schemas": {
            "pokemon": {
                "type": "object",
                "$schema": "http://json-schema.org/schema#",
                "properties": {
                    "url": {
                        "type": "string"
                    },
                    "name": {
                        "type": "string"
                    }
                },
                "additionalProperties": true
            },
            "location": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "properties": {},
                "additionalProperties": true
            },
            "somethingElse": {
                "type": "object",
                "$schema": "http://json-schema.org/draft-07/schema#",
                "properties": {},
                "additionalProperties": true
            }
        },
        "streams": [
            {
                "name": "pokemon",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "pokemon",
                        "type": "HttpRequester",
                        "url_base": {
                          "$ref": "#/definitions/shared/HttpRequester/url_base"
                        },
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/pokemon"
                    }
                }
            },
            {
                "name": "location",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "location",
                        "type": "HttpRequester",
                        "url_base": "https://someotherapi.com",
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/location"
                    }
                }
            },
            {
                "name": "somethingElse",
                "type": "DeclarativeStream",
                "retriever": {
                    "type": "SimpleRetriever",
                    "decoder": {
                        "type": "JsonDecoder"
                    },
                    "requester": {
                        "path": "location",
                        "type": "HttpRequester",
                        "url_base": "https://someotherapi.com",
                        "http_method": "GET"
                    },
                    "record_selector": {
                        "type": "RecordSelector",
                        "extractor": {
                            "type": "DpathExtractor",
                            "field_path": []
                        }
                    }
                },
                "schema_loader": {
                    "type": "InlineSchemaLoader",
                    "schema": {
                        "$ref": "#/schemas/somethingElse"
                    }
                }
            }
        ],
        "version": "6.41.5",
        "metadata": {
            "assist": {},
            "testedStreams": {
                "pokemon": {
                    "hasRecords": true,
                    "streamHash": "6f6304126c7d27b92d6f753eaaaea074a205e096",
                    "hasResponse": true,
                    "primaryKeysAreUnique": true,
                    "primaryKeysArePresent": true,
                    "responsesAreSuccessful": true
                },
                "location": {
                    "streamHash": null
                },
                "somethingElse": {
                    "streamHash": "fc475d6b671a83c102b6f1b51bad18512f22455f"
                }
            },
            "autoImportSchema": {
                "pokemon": false,
                "location": true
            }
        },
        "definitions": {
            "shared": {
                "HttpRequester": {
                    "url_base": "https://pokeapi.co/api/v2/"
                }
            }
        }
    }
}

@bazarnov
Copy link
Contributor Author

bazarnov commented Mar 31, 2025

@lmossman The PR has been updated according to this comment.

IMPORTANT:

  • this code enables the Manifest Optimization (deduplication + schema referencing) for the given manifest only for Connector Builder.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants