Replace 'aws s3 ls' shell-out in dataset.py with boto3

goanpeca · goanpeca · commit 4ea29bd7cec6 · 2026-06-01T22:31:17.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -163,4 +163,6 @@ cython_debug/
 *.wav
 wandb/*
 *.out
-test_*
+test_*
+# macOS
+.DS_Store
diff --git a/README.md b/README.md
@@ -171,6 +171,20 @@ The following properties are defined in the top level of the model configuration
 ## Dataset config
 `stable-audio-tools` currently supports two kinds of data sources: local directories of audio files, and WebDataset datasets stored in Amazon S3. More information can be found in [the dataset config documentation](docs/datasets.md)
 
+## S3-compatible storage (Backblaze B2)
+The S3 dataset loader uses `boto3`, which ships in the `train` extra. If you installed without that extra, add it with `pip install boto3` (or `pip install "stable-audio-tools[train]"`).
+
+The loader honors the `AWS_ENDPOINT_URL` environment variable, so you can point it at any S3-compatible host without changing the dataset config.
+
+Example for [Backblaze B2](https://github.com/backblaze-labs/):
+```bash
+export AWS_ENDPOINT_URL=https://s3.us-west-004.backblazeb2.com
+export AWS_ACCESS_KEY_ID=<B2 application key ID>
+export AWS_SECRET_ACCESS_KEY=<B2 application key>
+```
+
+When `AWS_ENDPOINT_URL` is unset, the loader uses default AWS S3, so existing setups are unaffected.
+
 # Todo
 - [ ] Add troubleshooting section
 - [ ] Add contribution guidelines 
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
 [project.optional-dependencies]
 train = [
     "auraloss==0.4.0",
+    "boto3>=1.26",
     "descript-audio-codec==1.0.0",
     "encodec==0.1.1",
     "inf-cl",
diff --git a/stable_audio_tools/data/dataset.py b/stable_audio_tools/data/dataset.py
@@ -1,24 +1,25 @@
 import importlib
-import numpy as np
 import io
 import json
 import os
-import dill
 import posixpath
 import random
-import re
-import subprocess
+import shlex
 import time
+from functools import lru_cache
+from importlib.metadata import PackageNotFoundError, version
+from os import path
+from typing import Callable, List, Optional
+
+import dill
+import numpy as np
 import torch
 import torchaudio
 import webdataset as wds
-
-from os import path
 from torch import nn
 from torchaudio import transforms as T
-from typing import Optional, Callable, List
 
-from .utils import Stereo, Mono, PhaseFlipper, PadCrop_Normalized_T, VolumeNorm, strip_trailing_silence
+from .utils import Mono, PadCrop_Normalized_T, PhaseFlipper, Stereo, VolumeNorm, strip_trailing_silence
 
 AUDIO_KEYS = ("flac", "wav", "mp3", "m4a", "ogg", "opus")
 
@@ -483,65 +484,176 @@ def __getitem__(self, idx):
 
 # S3 code and WDS preprocessing code based on implementation by Scott Hawley originally in https://github.com/zqevans/audio-diffusion/blob/main/dataset/dataset.py
 
-def get_s3_contents(dataset_path, s3_url_prefix=None, filter='', recursive=True, debug=False, profile=None):
+@lru_cache(maxsize=1)
+def _user_agent():
+    "Base ``stable-audio-tools/<version>`` token, looked up once on first use."
+    try:
+        ver = version("stable-audio-tools")
+    except PackageNotFoundError:  # source/editable checkout without dist metadata
+        ver = "dev"
+    return f"stable-audio-tools/{ver}"
+
+
+def _build_user_agent_extra(user_agent_extra=None):
+    """``stable-audio-tools/<version>`` with any caller- or env-provided
+    (``STABLE_AUDIO_TOOLS_USER_AGENT_EXTRA``) value appended, not replacing it.
+    Pass ``user_agent_extra=""`` to suppress the env value and use the base only."""
+    base = _user_agent()
+    if user_agent_extra is None:
+        user_agent_extra = os.environ.get("STABLE_AUDIO_TOOLS_USER_AGENT_EXTRA")
+    return f"{base} {user_agent_extra}" if user_agent_extra else base
+
+
+@lru_cache(maxsize=32)
+def _build_s3_client(profile, endpoint_url, user_agent_extra):
+    try:
+        import boto3  # local import so boto3 is only required when S3 is used
+        from botocore.config import Config
+    except ModuleNotFoundError as e:
+        raise ImportError(
+            "S3 dataset access requires boto3. Install it with "
+            "'pip install boto3' or 'pip install stable-audio-tools[train]'."
+        ) from e
+
+    session = boto3.Session(profile_name=profile) if profile else boto3.Session()
+    return session.client(
+        "s3",
+        endpoint_url=endpoint_url,
+        config=Config(user_agent_extra=user_agent_extra),
+    )
+
+
+def _get_s3_client(profile=None, user_agent_extra=None):
+    """
+    Build (and reuse) a boto3 S3 client. Honors AWS_ENDPOINT_URL when set so the
+    same code path works against any S3-compatible endpoint (AWS S3 by default;
+    set AWS_ENDPOINT_URL to a Backblaze B2 endpoint to point it at B2). When the
+    env var is unset, behavior matches the default AWS client.
+
+    Clients are cached per (profile, endpoint, user-agent) so listing and
+    presigning share one client instead of building a new one on each call.
     """
-    Returns a list of full S3 paths to files in a given S3 bucket and directory path.
+    endpoint_url = os.environ.get("AWS_ENDPOINT_URL") or None
+    return _build_s3_client(
+        profile, endpoint_url, _build_user_agent_extra(user_agent_extra)
+    )
+
+
+def _parse_s3_url(url):
+    "Split an ``s3://bucket/key`` URL into (bucket, key). Raises ValueError otherwise."
+    if not url.startswith("s3://"):
+        raise ValueError(f"expected an s3:// URL, got: {url!r}")
+    bucket, _, key = url[len("s3://"):].partition("/")
+    if not bucket.strip():
+        raise ValueError(f"s3:// URL is missing a bucket name: {url!r}")
+    return bucket, key
+
+
+def get_s3_contents(
+    dataset_path,
+    s3_url_prefix=None,
+    filter='',           # deprecated alias for filter_str; kept for backwards compat
+    recursive=True,
+    debug=False,
+    profile=None,
+    relative=False,
+    filter_str=None,     # preferred substring filter
+):
     """
+    Returns a list of object paths in a given S3 bucket and directory path.
+
+    By default returns full ``s3://bucket/key`` paths (backwards compatible with
+    the previous implementation). Pass ``relative=True`` to get keys relative to
+    ``dataset_path`` instead. Uses boto3 directly so it works against any
+    S3-compatible endpoint when ``AWS_ENDPOINT_URL`` is set.
+
+    ``filter_str`` is the preferred substring-filter argument; ``filter`` is kept
+    as a backwards-compatible alias.
+    """
+    if filter_str is None:
+        filter_str = filter
     # Ensure dataset_path ends with a trailing slash
     if dataset_path != '' and not dataset_path.endswith('/'):
         dataset_path += '/'
-    # Use posixpath to construct the S3 URL path
+    # Use posixpath to construct the S3 URL path (e.g. "s3://bucket/prefix/")
     bucket_path = posixpath.join(s3_url_prefix or '', dataset_path)
-    # Construct the `aws s3 ls` command
-    cmd = ['aws', 's3', 'ls', bucket_path]
 
-    if profile is not None:
-        cmd.extend(['--profile', profile])
+    bucket, prefix = _parse_s3_url(bucket_path)
+
+    s3 = _get_s3_client(profile=profile)
+    paginator = s3.get_paginator("list_objects_v2")
+    list_kwargs = {"Bucket": bucket, "Prefix": prefix}
+    if not recursive:
+        list_kwargs["Delimiter"] = "/"
+
+    keys = []
+    for page in paginator.paginate(**list_kwargs):
+        for obj in page.get("Contents", []) or []:
+            key = obj.get("Key", "")
+            if not key or key.endswith("/"):
+                continue
+            keys.append(key)
 
-    if recursive:
-        # Add the --recursive flag if requested
-        cmd.append('--recursive')
-    
-    # Run the `aws s3 ls` command and capture the output
-    run_ls = subprocess.run(cmd, capture_output=True, check=True)
-    # Split the output into lines and strip whitespace from each line
-    contents = run_ls.stdout.decode('utf-8').split('\n')
-    contents = [x.strip() for x in contents if x]
-    # Remove the timestamp from lines that begin with a timestamp
-    contents = [re.sub(r'^\S+\s+\S+\s+\d+\s+', '', x)
-                if re.match(r'^\S+\s+\S+\s+\d+\s+', x) else x for x in contents]
-    # Construct a full S3 path for each file in the contents list
-    contents = [posixpath.join(s3_url_prefix or '', x)
-                for x in contents if not x.endswith('/')]
     # Apply the filter, if specified
-    if filter:
-        contents = [x for x in contents if filter in x]
-    # Remove redundant directory names in the S3 URL
-    if recursive:
-        # Get the main directory name from the S3 URL
-        main_dir = "/".join(bucket_path.split('/')[3:])
-        # Remove the redundant directory names from each file path
-        contents = [x.replace(f'{main_dir}', '').replace(
-            '//', '/') for x in contents]
-    # Print debugging information, if requested
+    if filter_str:
+        keys = [k for k in keys if filter_str in k]
+
+    if relative:
+        # Strip the bucket-level prefix so keys are relative to dataset_path.
+        if prefix:
+            keys = [k[len(prefix):] if k.startswith(prefix) else k for k in keys]
+            keys = [k.lstrip('/') for k in keys]
+        contents = keys
+    else:
+        # Backwards-compatible default: full s3://bucket/key paths.
+        contents = [f"s3://{bucket}/{k}" for k in keys]
+
     if debug:
         print("contents = \n", contents)
-    # Return the list of S3 paths to files
+
     return contents
 
 
+# 7 days (SigV4 max) so shard URLs outlast long training runs (URLs signed with temporary
+# credentials can expire sooner). Override per call or via STABLE_AUDIO_TOOLS_S3_PRESIGN_EXPIRY.
+_DEFAULT_PRESIGN_EXPIRY_SECONDS = 7 * 24 * 3600
+
+
 def get_all_s3_urls(
-    names=[],           # list of all valid [LAION AudioDataset] dataset names
-    # list of subsets you want from those datasets, e.g. ['train','valid']
-    subsets=[''],
+    names=None,         # list of [LAION AudioDataset] dataset names; None -> []
+    subsets=None,       # list of subsets, e.g. ['train','valid']; None -> ['']
     s3_url_prefix=None,  # prefix for those dataset names
     recursive=True,     # recursively list all tar files in all subdirs
     filter_str='tar',   # only grab files with this substring
     # print debugging info -- note: info displayed likely to change at dev's whims
     debug=False,
-    profiles={},        # dictionary of profiles for each item in names, e.g. {'dataset1': 'profile1', 'dataset2': 'profile2'}
+    profiles=None,      # dict of profiles per name, e.g. {'dataset1': 'profile1'}; None -> {}
+    presign_expiry_seconds=None,  # presigned-URL lifetime; None -> env var or default
 ):
-    "get urls of shards (tar files) for multiple datasets in one s3 bucket"
+    """Get urls of shards (tar files) for multiple datasets in one s3 bucket.
+
+    Shards are fetched via presigned URLs handed to ``curl``. Each URL carries a
+    signed, time-limited credential (7 days unless overridden) on the command line,
+    so on shared/multi-tenant hosts it can be visible to other local users (e.g. via
+    ``ps``); keep ``presign_expiry_seconds`` short in those environments.
+    """
+    names = [] if names is None else names
+    subsets = [''] if subsets is None else subsets
+    profiles = profiles or {}
+    if presign_expiry_seconds is None:
+        presign_expiry_seconds = os.environ.get(
+            "STABLE_AUDIO_TOOLS_S3_PRESIGN_EXPIRY", _DEFAULT_PRESIGN_EXPIRY_SECONDS)
+    try:
+        presign_expiry_seconds = int(presign_expiry_seconds)
+    except (TypeError, ValueError):
+        raise ValueError(
+            "presign_expiry_seconds (or STABLE_AUDIO_TOOLS_S3_PRESIGN_EXPIRY) must be "
+            f"an integer number of seconds, got: {presign_expiry_seconds!r}"
+        )
+    if presign_expiry_seconds <= 0:
+        raise ValueError(
+            f"presign_expiry_seconds must be positive, got: {presign_expiry_seconds}"
+        )
     urls = []
     for name in names:
         # If s3_url_prefix is not specified, assume the full S3 path is included in each element of the names list
@@ -559,23 +671,33 @@ def get_all_s3_urls(
             # Get the list of tar files in the current subset directory
             profile = profiles.get(name, None)
             tar_list = get_s3_contents(
-                subset_str, s3_url_prefix=None, recursive=recursive, filter=filter_str, debug=debug, profile=profile)
+                subset_str, s3_url_prefix=None, recursive=recursive, filter_str=filter_str, debug=debug, profile=profile, relative=True)
+            # Reuse the cached S3 client (shared with get_s3_contents) for presigning.
+            s3_client = _get_s3_client(profile=profile)
             for tar in tar_list:
-                # Escape spaces and parentheses in the tar filename for use in the shell command
-                tar = tar.replace(" ", "\ ").replace(
-                    "(", "\(").replace(")", "\)")
-                # Construct the S3 path to the current tar file
-                s3_path = posixpath.join(name, subset, tar) + " -"
-                # Construct the AWS CLI command to download the current tar file
+                # Construct the full s3:// URL for the current tar file.
                 if s3_url_prefix is None:
-                    request_str = f"pipe:aws s3 --cli-connect-timeout 0 cp {s3_path}"
+                    full_s3_url = posixpath.join(name, subset, tar)
                 else:
-                    request_str = f"pipe:aws s3 --cli-connect-timeout 0 cp {posixpath.join(s3_url_prefix, s3_path)}"
-                if profiles.get(name):
-                    request_str += f" --profile {profiles.get(name)}"
+                    full_s3_url = posixpath.join(s3_url_prefix, name, subset, tar)
+
+                bucket, key = _parse_s3_url(full_s3_url)
+
+                # Presigned GET URL works against AWS and any S3-compatible
+                # endpoint when AWS_ENDPOINT_URL is set. Expiry is configurable
+                # so long training runs do not outlive their shard URLs.
+                presigned = s3_client.generate_presigned_url(
+                    "get_object",
+                    Params={"Bucket": bucket, "Key": key},
+                    ExpiresIn=presign_expiry_seconds,
+                )
+                # --retry restores the transient-failure resilience the AWS CLI
+                # had; shlex.quote keeps URL contents safe in the pipe: shell command.
+                request_str = f"pipe:curl -fsSL --retry 5 {shlex.quote(presigned)}"
                 if debug:
-                    print("request_str = ", request_str)
-                # Add the constructed URL to the list of URLs
+                    # Strip the signed query string so signatures are not logged.
+                    redacted = presigned.split("?", 1)[0] + "?<redacted>"
+                    print("request_str = pipe:curl -fsSL --retry 5", shlex.quote(redacted))
                 urls.append(request_str)
     return urls
 
diff --git a/tests/test_dataset_s3.py b/tests/test_dataset_s3.py

-Original file line number
+Diff line change
 *.wav
 wandb/*
 *.out
 -test_*
 +test_*
 +# macOS
 +.DS_Store