From d9c530e5bdbb4bdbc144239511d7c91b26664895 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Fri, 5 Dec 2025 08:11:56 +0100 Subject: [PATCH 1/3] MAINT: Signature version 4 --- dtool_s3/storagebroker.py | 123 +++++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 8 deletions(-) diff --git a/dtool_s3/storagebroker.py b/dtool_s3/storagebroker.py index d47d5ee..1f9020d 100644 --- a/dtool_s3/storagebroker.py +++ b/dtool_s3/storagebroker.py @@ -290,6 +290,11 @@ def _get_resource_and_client(cls, bucket_name): unsigned_config = botocore.client.Config( signature_version=botocore.UNSIGNED) + # Use signature version 4 for presigned URLs - required for cross-network + # access where URLs are generated in one network (e.g., container) but + # used from another (e.g., host machine) + signed_config = botocore.client.Config(signature_version='s3v4') + if ( s3_endpoint is not None or s3_access_key_id is not None @@ -321,7 +326,8 @@ def _get_resource_and_client(cls, bucket_name): ) s3client = session.client( 's3', - endpoint_url=s3_endpoint + endpoint_url=s3_endpoint, + config=signed_config ) unsigned_s3client = boto3.client( 's3', @@ -330,7 +336,7 @@ def _get_resource_and_client(cls, bucket_name): ) else: s3resource = boto3.resource('s3') - s3client = boto3.client('s3') + s3client = boto3.client('s3', config=signed_config) unsigned_s3client = boto3.client('s3', config=unsigned_config) return s3resource, s3client, unsigned_s3client @@ -499,12 +505,21 @@ def get_admin_metadata(self): ).get() admin_metadata = response['Metadata'] - # s3-native metadata comes as str only, convert timestamps - # back to float: - if "frozen_at" in admin_metadata: - admin_metadata["frozen_at"] = float(admin_metadata["frozen_at"]) - if "created_at" in admin_metadata: - admin_metadata["created_at"] = float(admin_metadata["created_at"]) + + # If S3 metadata headers are empty (e.g., when uploaded via presigned URLs), + # fall back to reading from the object body + if not admin_metadata: 
+ body = response['Body'].read().decode('utf-8') + if body: + admin_metadata = json.loads(body) + else: + # s3-native metadata comes as str only, convert timestamps + # back to float: + if "frozen_at" in admin_metadata: + admin_metadata["frozen_at"] = float(admin_metadata["frozen_at"]) + if "created_at" in admin_metadata: + admin_metadata["created_at"] = float(admin_metadata["created_at"]) + return admin_metadata def get_size_in_bytes(self, handle): @@ -769,6 +784,98 @@ def get_item_metadata(self, handle): return metadata + # Signed URL generation for dserver delegate access + + def generate_signed_read_url(self, key, expiry_seconds=3600): + """Generate a presigned URL for reading an object. + + :param key: S3 object key + :param expiry_seconds: Time in seconds for the presigned URL to remain valid + :returns: Presigned URL as string + """ + try: + url = self.s3client.generate_presigned_url( + 'get_object', + Params={'Bucket': self.bucket, 'Key': key}, + ExpiresIn=expiry_seconds + ) + return url + except botocore.exceptions.ClientError as e: + logger.error(f"Failed to generate signed read URL: {e}") + raise + + def generate_signed_write_url(self, key, expiry_seconds=3600): + """Generate a presigned URL for writing an object. + + :param key: S3 object key + :param expiry_seconds: Time in seconds for the presigned URL to remain valid + :returns: Presigned URL as string + """ + try: + url = self.s3client.generate_presigned_url( + 'put_object', + Params={'Bucket': self.bucket, 'Key': key}, + ExpiresIn=expiry_seconds + ) + return url + except botocore.exceptions.ClientError as e: + logger.error(f"Failed to generate signed write URL: {e}") + raise + + def generate_dataset_signed_urls(self, expiry_seconds=3600): + """Generate all signed URLs needed to access a dataset. 
+ + This method returns a dictionary containing signed URLs for: + - admin_metadata_url: The dataset administrative metadata (dtool file) + - manifest_url: The manifest.json file + - readme_url: The README.yml file + - item_urls: Dictionary mapping item identifiers to signed URLs + - overlay_urls: Dictionary mapping overlay names to signed URLs + - annotation_urls: Dictionary mapping annotation names to signed URLs + + :param expiry_seconds: Time in seconds for the presigned URLs to remain valid + :returns: Dictionary containing all signed URLs for the dataset + """ + prefix = self._get_prefix() + + urls = { + 'admin_metadata_url': self.generate_signed_read_url( + self.get_admin_metadata_key(), expiry_seconds), + 'manifest_url': self.generate_signed_read_url( + self.get_manifest_key(), expiry_seconds), + 'readme_url': self.generate_signed_read_url( + self.get_readme_key(), expiry_seconds), + 'item_urls': {}, + 'overlay_urls': {}, + 'annotation_urls': {}, + 'tags': self.list_tags() + } + + # Generate URLs for all items + manifest = self.get_manifest() + for identifier in manifest.get('items', {}).keys(): + item_key = self.data_key_prefix + identifier + urls['item_urls'][identifier] = self.generate_signed_read_url( + item_key, expiry_seconds) + + # Generate URLs for all overlays + for overlay_name in self.list_overlay_names(): + overlay_key = self.get_overlay_key(overlay_name) + urls['overlay_urls'][overlay_name] = self.generate_signed_read_url( + overlay_key, expiry_seconds) + + # Generate URLs for all annotations + for annotation_name in self.list_annotation_names(): + annotation_key = self.get_annotation_key(annotation_name) + urls['annotation_urls'][annotation_name] = self.generate_signed_read_url( + annotation_key, expiry_seconds) + + return urls + + def supports_signing(self): + """Return True as S3 supports signed URL generation.""" + return True + # HTTP enabling functions def _create_presigned_url(self, object_name, expiration): """Generate a presigned URL 
to share an S3 object From 1c0925b10848e47ff4f24ef19a6ab258b2ab8b41 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 7 Dec 2025 23:25:46 +0100 Subject: [PATCH 2/3] BUILD: Switched build system to flit --- dtool_s3/__init__.py | 23 ++++++++++++++++++++-- pyproject.toml | 45 ++++++++++++++++++++++++++++++++++++++++++++ setup.cfg | 10 ---------- setup.py | 34 --------------------------------- 4 files changed, 66 insertions(+), 46 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/dtool_s3/__init__.py b/dtool_s3/__init__.py index 3159726..0fdd049 100644 --- a/dtool_s3/__init__.py +++ b/dtool_s3/__init__.py @@ -2,6 +2,25 @@ import logging -__version__ = "0.14.1" - logger = logging.getLogger(__name__) + +# workaround for diverging python versions: +try: + from importlib.metadata import version, PackageNotFoundError + logger.debug("imported version, PackageNotFoundError from importlib.metadata") +except ModuleNotFoundError: + from importlib_metadata import version, PackageNotFoundError + logger.debug("imported version, PackageNotFoundError from importlib_metadata") + +# first, try to determine dynamic version at runtime +try: + __version__ = version(__name__) + logger.debug("Determined version %s via importlib_metadata.version", __version__) +except PackageNotFoundError: + # if that fails, check for static version file written by setuptools_scm + try: + from .version import version as __version__ + logger.debug("Determined version %s from autogenerated dtool_s3/version.py", __version__) + except Exception as e: + logger.debug("All efforts to determine version failed: %s", e) + __version__ = None diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3868bf9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["flit_scm"] +build-backend = "flit_scm:buildapi" + +[project] +name = "dtool-s3" +description = "Add S3 support to dtool" +readme = 
"README.rst" +license = {text = "MIT"} +authors = [ + {name = "Tjelvar Olsson", email = "tjelvar.olsson@gmail.com"} +] +dynamic = ["version"] +requires-python = ">=3.8" +dependencies = [ + "click", + "dtoolcore>=3.17", + "dtool_cli", + "boto3", + "packaging", +] + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-cov", +] + +[project.urls] +Repository = "https://github.com/jic-dtool/dtool-s3" + +[project.entry-points."dtool.storage_brokers"] +S3StorageBroker = "dtool_s3.storagebroker:S3StorageBroker" + +[tool.flit.module] +name = "dtool_s3" + +[tool.setuptools_scm] +version_scheme = "guess-next-dev" +local_scheme = "no-local-version" +write_to = "dtool_s3/version.py" + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--cov=dtool_s3 --cov-report=term-missing" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 20291a5..0000000 --- a/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[flake8] -exclude=env*,.tox,.git,*.egg,build,docs - -[tool:pytest] -testpaths = tests -addopts = --cov=dtool_s3 -#addopts = -x --pdb - -[cov:run] -source = dtool_s3 diff --git a/setup.py b/setup.py deleted file mode 100644 index bc0c243..0000000 --- a/setup.py +++ /dev/null @@ -1,34 +0,0 @@ -from setuptools import setup - -url = "https://github.com/jic-dtool/dtool-s3" -version = "0.14.1" -readme = open('README.rst').read() - -setup( - name="dtool-s3", - packages=["dtool_s3"], - version=version, - description="Add S3 support to dtool", - long_description=readme, - include_package_data=True, - # Package will be released using Tjelvar's PyPi credentials. 
- author="Tjelvar Olsson", - author_email="tjelvar.olsson@gmail.com", -# author="Matthew Hartley", # NOQA -# author_email="matthew.hartley@jic.ac.uk", # NOQA - url=url, - download_url="{}/tarball/{}".format(url, version), - install_requires=[ - "click", - "dtoolcore>=3.17", - "dtool_cli", - "boto3", - "packaging", - ], - entry_points={ - "dtool.storage_brokers": [ - "S3StorageBroker=dtool_s3.storagebroker:S3StorageBroker", - ], - }, - license="MIT" -) From 22e73ccf4ae841aee89425bdbcb12a6af9957c36 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Mon, 8 Dec 2025 09:08:40 +0100 Subject: [PATCH 3/3] DOC: Updated CHANGELOG.rst and README.rst --- CHANGELOG.rst | 22 ++++++++++++++++---- README.rst | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e7ab1e6..071678a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,16 +4,30 @@ CHANGELOG This project uses `semantic versioning `_. This change log uses principles from `keep a changelog `_. -[Unreleased] ------------- +[0.15.0] - 2025-12-08 +--------------------- Added ^^^^^ +- Added ``generate_signed_read_url(key, expiry_seconds)`` method for generating + presigned URLs for reading objects from S3 +- Added ``generate_signed_write_url(key, expiry_seconds)`` method for generating + presigned URLs for writing objects to S3 +- Added ``generate_dataset_signed_urls(expiry_seconds)`` method for generating + all signed URLs needed to access a complete dataset (admin metadata, manifest, + README, items, overlays, annotations) +- Added ``supports_signing()`` method that returns ``True`` to indicate S3 + supports signed URL generation +- Added ``get_readme_key()`` method to retrieve the S3 key for the README file + Changed ^^^^^^^ +- Switched build system to flit +- Updated to use AWS Signature Version 4 for presigned URLs + Deprecated ^^^^^^^^^^ @@ -58,7 +72,7 @@ Added with this version of dtool. 
It is not anticipated that anyone encounter this scenario as proto datasets are more or less ephemeral when datasets are copied to s3. This feature fixes https://github.com/jic-dtool/dtool-s3/issues/14. - Thanks to `Johannes L. Hörmann `_ and `Lars + Thanks to `Johannes L. Hörmann `_ and `Lars Pastewka `_ for reporting this issue. @@ -84,7 +98,7 @@ Fixed - Fixed long standing issue with ``created_at`` and ``frozen_at`` admin metadata being returned as string rather than float. Many thanks to - `Johannes L. Hörmann `_ for reporting and fixing. + `Johannes L. Hörmann `_ for reporting and fixing. See https://github.com/jic-dtool/dtool-s3/pull/13. diff --git a/README.rst b/README.rst index 47012c4..fbf7389 100644 --- a/README.rst +++ b/README.rst @@ -15,6 +15,7 @@ Features - Copy datasets to and from S3 object storage - List all the datasets in a S3 bucket - Create datasets directly in S3 +- Generate presigned URLs for secure, time-limited access to datasets Installation ------------ @@ -175,6 +176,61 @@ Those are the registration keys that are not stored under the configured prefix. The registration keys contain the prefix where the respective dataset is found. They are empty if no prefix is configured. + +Signed URLs for programmatic access +----------------------------------- + +The S3 storage broker provides methods for generating presigned URLs, enabling +secure, time-limited access to datasets without sharing AWS credentials. This +is particularly useful for server applications like ``dserver`` that need to +delegate dataset access to clients. + +The following methods are available on the storage broker: + +``generate_signed_read_url(key, expiry_seconds=3600)`` + Generate a presigned URL for reading a single object. + +``generate_signed_write_url(key, expiry_seconds=3600)`` + Generate a presigned URL for writing a single object.
+ +``generate_dataset_signed_urls(expiry_seconds=3600)`` + Generate presigned URLs for all components of a dataset (admin metadata, + manifest, README, items, overlays, and annotations). + +``supports_signing()`` + Returns ``True`` to indicate that S3 supports signed URL generation. + +Example usage:: + + import dtoolcore + + # Load an existing dataset + dataset = dtoolcore.DataSet.from_uri("s3://my-bucket/my-dataset-uuid") + + # Access the storage broker + storage_broker = dataset._storage_broker + + # Check if signing is supported + if storage_broker.supports_signing(): + # Generate URLs for the entire dataset (valid for 1 hour) + urls = storage_broker.generate_dataset_signed_urls(expiry_seconds=3600) + + # URLs dictionary contains: + # - 'admin_metadata_url': URL to read admin metadata + # - 'manifest_url': URL to read manifest + # - 'readme_url': URL to read README + # - 'item_urls': dict mapping identifier -> URL for each item + # - 'overlay_urls': dict mapping overlay_name -> URL + # - 'annotation_urls': dict mapping annotation_name -> URL + + # Generate a URL for a single item + item_key = storage_broker.data_key_prefix + item_identifier + item_url = storage_broker.generate_signed_read_url(item_key) + +These methods are used by ``dserver-signed-url-plugin`` to provide secure +dataset access through a REST API. + + Testing -------