-
Notifications
You must be signed in to change notification settings - Fork 67
Open
Description
I get the following error, which does not happen locally but only when running in Docker on the EC2 instance:
ERROR: Cannot parse "https://s3..amazonaws.com/" as a URL
This is quite strange, because printing the generated config file shows:
['{"s3_bucket_name": "wsi-eus1-pro-datalake-data-s3", "s3_bucket_path": "test", "s3_bucket_region": "eu-central-1", "s3_endpoint": "https://s3.eu-central-1.amazonaws.com", "format": {"flattening": "No flattening", "compression": {"compression_type": "GZIP"}, "format_type": "JSONL"}}']
And here is the spec of the connector (printed from the spec command, to make sure it was the same destination container):
[AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.INFO: 'INFO'>, message='INFO main i.m.c.e.DefaultEnvironment(<init>):168 Established active environments: [cli, destination, aws, connector]', stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.INFO: 'INFO'>, message='INFO main i.a.c.AirbyteConnectorRunnable(run):33 Executing class io.airbyte.cdk.spec.SpecOperation operation.', stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'format_type' in [simple type, class io.airbyte.cdk.load.command.object_storage.DeprecatedObjectStorageFormatSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN:'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'compression_type' in [simple type, class io.airbyte.cdk.load.command.object_storage.ObjectStorageCompressionSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'compression_type' in [simple type, 
class io.airbyte.cdk.load.command.object_storage.ObjectStorageCompressionSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'format_type' in [simple type, class io.airbyte.cdk.load.command.object_storage.DeprecatedObjectStorageFormatSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'format_type' in [simple type, class io.airbyte.cdk.load.command.object_storage.DeprecatedObjectStorageFormatSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'codec' in [simple type, class io.airbyte.cdk.load.command.avro.AvroFormatCompressionCodecSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN 
main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'codec' in [simple type, class io.airbyte.cdk.load.command.avro.AvroFormatCompressionCodecSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'codec' in [simple type, class io.airbyte.cdk.load.command.avro.AvroFormatCompressionCodecSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'codec' in [simple type, class io.airbyte.cdk.load.command.avro.AvroFormatCompressionCodecSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'codec' in [simple type, class io.airbyte.cdk.load.command.avro.AvroFormatCompressionCodecSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), 
AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'codec' in [simple type, class io.airbyte.cdk.load.command.avro.AvroFormatCompressionCodecSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.WARN: 'WARN'>, message="WARN main c.k.j.j.JsonSchemaGenerator$MyJsonFormatVisitorWrapper$$anon$9(myPropertyHandler):1116 Ignoring property 'format_type' in [simple type, class io.airbyte.cdk.load.command.object_storage.DeprecatedObjectStorageFormatSpecification$Type] since it has already been added, probably as type-property using polymorphism", stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.SPEC: 'SPEC'>, log=None, spec=ConnectorSpecification(documentationUrl=AnyUrl('https://docs.airbyte.com/integrations/destinations/s3'), changelogUrl=None, connectionSpecification={'$schema': 'http://json-schema.org/draft-07/schema#', 'title': 'S3 V2 Destination Spec', 'type': 'object', 'additionalProperties': True, 'properties': {'access_key_id': {'type': 'string', 'description': 'The access key ID to access the S3 bucket. Airbyte requires Read and Write permissions to the given bucket. Read more <a href="https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#access-keys-and-secret-access-keys">here</a>.', 'title': 'Access Key ID', 'examples': ['A012345678910EXAMPLE'], 'airbyte_secret': True, 'always_show': True, 'order': 0}, 'secret_access_key': {'type': 'string', 'description': 'The corresponding secret to the access key ID. 
Read more <a href="https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#access-keys-and-secret-access-keys">here</a>', 'title': 'Secret Access Key', 'examples': ['a012345678910ABCDEFGH/AbCdEfGhEXAMPLEKEY'], 'airbyte_secret': True, 'always_show': True, 'order': 1}, 'role_arn': {'type': 'string', 'description': 'The ARN of the AWS role to assume. Only usable in Airbyte Cloud.', 'title': 'Role ARN', 'examples': ['arn:aws:iam::123456789:role/ExternalIdIsYourWorkspaceId'], 'order': 2}, 's3_bucket_name': {'type': 'string', 'description': 'The name of the S3 bucket. Read more <a href="https://docs.aws.amazon.com/AmazonS3/latest/userguide/create-bucket-overview.html">here</a>.', 'title': 'S3 Bucket Name', 'examples': ['airbyte_sync'], 'order': 3}, 's3_bucket_path': {'type': 'string', 'description': 'Directory under the S3 bucket where data will be written. Read more <a href="https://docs.airbyte.com/integrations/destinations/s3#:~:text=to%20format%20the-,bucket%20path,-%3A">here</a>', 'title': 'S3 Bucket Path', 'examples': ['data_sync/test'], 'order': 4}, 's3_bucket_region': {'type': 'string', 'enum': ['', 'af-south-1', 'ap-east-1', 'ap-northeast-1', 'ap-northeast-2', 'ap-northeast-3', 'ap-south-1', 'ap-south-2', 'ap-southeast-1', 'ap-southeast-2', 'ap-southeast-3', 'ap-southeast-4', 'ca-central-1', 'ca-west-1', 'cn-north-1', 'cn-northwest-1', 'eu-central-1', 'eu-central-2', 'eu-north-1', 'eu-south-1', 'eu-south-2', 'eu-west-1', 'eu-west-2', 'eu-west-3', 'il-central-1', 'me-central-1', 'me-south-1', 'sa-east-1', 'us-east-1', 'us-east-2', 'us-gov-east-1', 'us-gov-west-1', 'us-west-1', 'us-west-2'], 'description': 'The region of the S3 bucket. 
See <a href="https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions">here</a> for all region codes.', 'title': 'S3 Bucket Region', 'examples': ['us-east-1'], 'order': 5, 'default': ''}, 'format': {'oneOf': [{'title': 'CSV: Comma-Separated Values', 'type': 'object', 'additionalProperties': True, 'properties': {'format_type': {'type': 'string', 'enum': ['CSV'], 'default': 'CSV'}, 'flattening': {'type': 'string', 'default': 'No flattening', 'enum': ['No flattening', 'Root level flattening'], 'title': 'Flattening'}, 'compression': {'oneOf': [{'title': 'No Compression', 'type': 'object', 'additionalProperties': True, 'properties': {'compression_type': {'type': 'string', 'enum': ['No Compression'], 'default': 'No Compression'}}, 'required': ['compression_type']}, {'title': 'GZIP', 'type': 'object', 'additionalProperties': True, 'properties': {'compression_type': {'type': 'string', 'enum': ['GZIP'], 'default': 'GZIP'}}, 'required': ['compression_type']}], 'description': 'Whether the output files should be compressed. 
If compression is selected, the output filename will have an extra extension (GZIP: ".jsonl.gz").', 'title': 'Compression', 'type': 'object'}}, 'required': ['format_type', 'flattening']}, {'title': 'JSON Lines: Newline-delimited JSON', 'type': 'object', 'additionalProperties': True, 'properties': {'format_type': {'type': 'string', 'enum': ['JSONL'], 'default': 'JSONL'}, 'flattening': {'type': 'string', 'default': 'No flattening', 'enum': ['No flattening', 'Root level flattening'], 'title': 'Flattening'}, 'compression': {'oneOf': [{'title': 'No Compression', 'type': 'object', 'additionalProperties': True, 'properties': {'compression_type': {'type': 'string', 'enum': ['No Compression'], 'default': 'No Compression'}}, 'required': ['compression_type']}, {'title': 'GZIP', 'type': 'object', 'additionalProperties': True, 'properties': {'compression_type': {'type': 'string', 'enum': ['GZIP'], 'default': 'GZIP'}}, 'required': ['compression_type']}], 'description': 'Whether the output files should be compressed. 
If compression is selected, the output filename will have an extra extension (GZIP: ".jsonl.gz").', 'title': 'Compression', 'type': 'object'}}, 'required': ['format_type']}, {'title': 'Avro: Apache Avro', 'type': 'object', 'additionalProperties': True, 'properties': {'format_type': {'type': 'string', 'enum': ['Avro'], 'default': 'Avro'}, 'compression_codec': {'oneOf': [{'type': 'object', 'additionalProperties': True, 'properties': {'codec': {'type': 'string', 'enum': ['no compression'], 'default': 'no compression'}}, 'title': 'no compression', 'required': ['codec']}, {'type': 'object', 'additionalProperties': True, 'properties': {'codec': {'type': 'string', 'enum': ['Deflate'], 'default': 'Deflate'}, 'compression_level': {'type': 'integer', 'title': 'Deflate Level'}}, 'title': 'Deflate', 'required': ['codec', 'compression_level']}, {'type': 'object', 'additionalProperties': True, 'properties': {'codec': {'type': 'string', 'enum': ['bzip2'], 'default': 'bzip2'}}, 'title': 'bzip2', 'required': ['codec']}, {'type': 'object', 'additionalProperties': True, 'properties': {'codec': {'type': 'string', 'enum': ['xz'], 'default': 'xz'}, 'compression_level': {'type': 'integer', 'title': 'Compression Level'}}, 'title': 'xz', 'required': ['codec', 'compression_level']}, {'type': 'object', 'additionalProperties': True, 'properties': {'codec': {'type': 'string', 'enum': ['zstandard'], 'default': 'zstandard'}, 'compression_level': {'type': 'integer', 'title': 'Compression Level'}, 'include_checksum': {'type': 'boolean', 'title': 'Include Checksum'}}, 'title': 'zstandard', 'required': ['codec', 'compression_level', 'include_checksum']}, {'type': 'object', 'additionalProperties': True, 'properties': {'codec': {'type': 'string', 'enum': ['snappy'], 'default': 'snappy'}}, 'title': 'snappy', 'required': ['codec']}], 'description': 'The compression algorithm used to compress data. 
Default to no compression.', 'title': 'Compression Codec', 'order': 1, 'type': 'object'}}, 'required': ['format_type', 'compression_codec']}, {'title': 'Parquet: Columnar Storage', 'type': 'object', 'additionalProperties': True, 'properties': {'format_type': {'type': 'string', 'enum': ['Parquet'], 'default': 'Parquet'}, 'compression_codec': {'type': 'string', 'default': 'UNCOMPRESSED', 'enum': ['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD'], 'description': 'The compression algorithm used to compress data pages.', 'title': 'Compression Codec'}, 'block_size_mb': {'type': 'integer', 'default': 128, 'description': 'This is the size of a row group being buffered in memory. It limits the memory usage when writing. Larger values will improve the IO when reading, but consume more memory when writing. Default: 128 MB.', 'title': 'Block Size (Row Group Size) (MB)'}, 'max_padding_size_mb': {'type': 'integer', 'default': 8, 'description': 'Maximum size allowed as padding to align row groups. This is also the minimum size of a row group. Default: 8 MB.', 'title': 'Max Padding Size (MB)'}, 'page_size_kb': {'type': 'integer', 'default': 1024, 'description': 'The page size is for compression. A block is composed of pages. A page is the smallest unit that must be read fully to access a single record. If this value is too small, the compression will deteriorate. Default: 1024 KB.', 'title': 'Page Size (KB)'}, 'dictionary_page_size_kb': {'type': 'integer', 'default': 1024, 'description': 'There is one dictionary page per column per row group when dictionary encoding is used. The dictionary page size works like the page size but for dictionary. Default: 1024 KB.', 'title': 'Dictionary Page Size (KB)'}, 'dictionary_encoding': {'type': 'boolean', 'description': 'Default: true.', 'title': 'Dictionary Encoding'}}, 'required': ['format_type']}], 'description': 'Format of the data output. 
See <a href="https://docs.airbyte.com/integrations/destinations/s3/#supported-output-schema">here</a> for more details', 'title': 'Output Format', 'order': 6, 'type': 'object'}, 's3_endpoint': {'type': 'string', 'description': 'Your S3 endpoint url. Read more <a href="https://docs.aws.amazon.com/general/latest/gr/s3.html#:~:text=Service%20endpoints-,Amazon%20S3%20endpoints,-When%20you%20use">here</a>', 'title': 'S3 Endpoint', 'examples': ['http://localhost:9000'], 'order': 7}, 's3_path_format': {'type': 'string', 'description': 'Format string on how data will be organized inside the bucket directory. Read more <a href="https://docs.airbyte.com/integrations/destinations/s3#:~:text=The%20full%20path%20of%20the%20output%20data%20with%20the%20default%20S3%20path%20format">here</a>', 'title': 'S3 Path Format', 'examples': ['${NAMESPACE}/${STREAM_NAME}/${YEAR}_${MONTH}_${DAY}_${EPOCH}_'], 'order': 8}, 'file_name_pattern': {'type': 'string', 'description': 'Pattern to match file names in the bucket directory. 
Read more <a href="https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html">here</a>', 'title': 'File Name Pattern', 'examples': ['{date}', '{date:yyyy_MM}', '{timestamp}', '{part_number}', '{sync_id}'], 'order': 9}}, 'required': ['s3_bucket_name', 's3_bucket_path', 's3_bucket_region', 'format']}, supportsIncremental=True, supportsNormalization=False, supportsDBT=False, supported_destination_sync_modes=[<DestinationSyncMode.overwrite: 'overwrite'>, <DestinationSyncMode.append: 'append'>], advanced_auth=None, protocol_version=None), connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.INFO: 'INFO'>, message='INFO main i.a.c.AirbyteConnectorRunnable(run):46 Flushing output consumer prior to shutdown.', stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None), AirbyteMessage(type=<Type.LOG: 'LOG'>, log=AirbyteLogMessage(level=<Level.INFO: 'INFO'>, message='INFO main i.a.c.AirbyteConnectorRunnable(run):48 Completed integration: airbyte/destination-s3.', stack_trace=None), spec=None, connectionStatus=None, catalog=None, record=None, state=None, trace=None, control=None)]
Here is the code:
import ssl
import environ
import airbyte as ab
# Environment reader; every setting below is looked up through env("NAME").
env = environ.Env()
# Load variables via Env.read_env() before any lookup happens
# (presumably from a .env file — confirm against the `environ` package in use).
environ.Env.read_env()
def run_sftp_to_s3_transfer():
    """
    Entry point of the SFTP-to-S3 transfer job.

    As written, this only builds the ``destination-s3`` connector through
    PyAirbyte from environment settings (bucket name, path and region) and
    runs its connection check. No source is configured and no data is moved
    in this function.
    """
    # Output format handed to the destination: gzip-compressed JSONL, no flattening.
    output_format = {
        "flattening": "No flattening",
        "compression": {"compression_type": "GZIP"},
        "format_type": "JSONL",
    }

    s3_config = {
        "s3_bucket_name": env("S3_BUCKET_NAME"),
        "s3_bucket_path": env("S3_BUCKET_PATH"),
        # Explicit credentials and a region-derived endpoint are currently disabled:
        # "access_key_id": env("AWS_ACCESS_KEY_ID"),
        # "secret_access_key": env("AWS_SECRET_ACCESS_KEY"),
        # "s3_endpoint": f"https://s3.{env('S3_BUCKET_REGION')}.amazonaws.com",
        "s3_bucket_region": env("S3_BUCKET_REGION"),
        # The endpoint is passed as an empty string rather than omitted.
        "s3_endpoint": "",
        "format": output_format,
    }

    # Configure S3 destination
    destination_s3 = ab.get_destination(
        "destination-s3",
        config=s3_config,
        use_python=False,
        install_if_missing=True,
        use_host_network=True,
    )

    print("Checking S3 destination connection...")
    destination_s3.check()
    print("S3 destination connection successful.")
# Run the job only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run_sftp_to_s3_transfer()
And here is the Dockerfile:
# uv release to install. Pinned so rebuilds are reproducible instead of tracking
# whatever `latest` points to; override with `--build-arg UV_VERSION=<version>`.
ARG UV_VERSION=0.8.0
# Named stage so the pinned tag can be referenced from COPY --from below.
FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv

FROM python:3.12-slim-trixie

# Take the uv and uvx binaries from the pinned uv image.
COPY --from=uv /uv /uvx /bin/

# Runtime environment: uv settings, the import path for the project, and the
# directory exported as AIRBYTE_LOCAL_ROOT.
ENV UV_COMPILE_BYTECODE=1 \
    UV_LINK_MODE=copy \
    UV_PYTHON_CACHE_DIR=/root/.cache/uv/python \
    UV_SYSTEM_PYTHON=1 \
    PYTHONPATH=/project \
    AIRBYTE_LOCAL_ROOT=/project/airbyte_local

# Build-time only: keeps apt non-interactive during the build without leaving
# DEBIAN_FRONTEND set in the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive
# GID given to the `docker` group created later in this file.
ARG DOCKER_GID=999
# Install Docker CLI (client package only) from Docker's apt repository.
# NOTE(review): no daemon is installed here, so the client presumably talks to
# the host daemon through a socket mounted at run time — confirm. In that setup
# bind-mount source paths are resolved on the daemon's host, not inside this
# container; verify that paths handed to connector containers exist on the host.

# Use bash with pipefail so a failure on the left side of the `curl | gpg` and
# `echo | tee` pipelines fails the build; plain `/bin/sh -c` only reports the
# exit status of the last command in a pipeline.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN apt-get update \
    && apt-get install -y --no-install-recommends ca-certificates curl gnupg \
    && install -m 0755 -d /etc/apt/keyrings \
    && curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \
    && chmod a+r /etc/apt/keyrings/docker.gpg \
    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/debian $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
    && apt-get update \
    && apt-get install -y --no-install-recommends docker-ce-cli \
    && rm -rf /var/lib/apt/lists/*

# Create the `docker` group with the GID from the DOCKER_GID build argument
# (quoted so an unexpected value cannot be word-split) and add root to it.
RUN groupadd -g "$DOCKER_GID" docker && usermod -aG docker root
# All following relative paths resolve against /project.
WORKDIR /project

# Directory referenced by AIRBYTE_LOCAL_ROOT.
RUN mkdir -p /project/airbyte_local

# Install locked dependencies first, without the project itself, using only the
# bind-mounted lockfile and pyproject.toml so this layer stays cached while
# application source changes.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --locked --no-install-project

# Bring in the application source (destination is the WORKDIR, /project).
COPY . .

# Second sync, this time including the project.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked

# Exec-form entrypoint: run the transfer script through uv.
ENTRYPOINT ["uv", "run", "data_jobs/sftp_excel_transfer.py"]
What am I missing?
Metadata
Metadata
Assignees
Labels
No labels