|
| 1 | +# /// script |
| 2 | +# dependencies = [] |
| 3 | +# requires-python = ">=3.12" |
| 4 | +# /// |
| 5 | +"""Download a sub-01 subset of OpenNeuro ds000114 for longitudinal tests. |
| 6 | +
|
| 7 | +ds000114 ("Test-Retest Reliability") is the canonical multi-session BIDS |
| 8 | +demo dataset. We pull sub-01 only (both ses-test and ses-retest) plus the |
| 9 | +top-level sidecars, because BIDS inheritance places RepetitionTime etc. |
| 10 | +in the dataset-level task JSON, not per-subject. |
| 11 | +
|
| 12 | +OpenNeuro snapshot: 1.0.2 (tagged 2022-08-24, verified to contain |
| 13 | +``TaskName`` in ``task-fingerfootlips_bold.json`` so bids-validator is |
| 14 | +happy). The public S3 mirror only exposes the latest state at top-level |
| 15 | +(``s3://openneuro.org/ds000114/...``), so we can't pin via a snapshot |
| 16 | +path. Instead we verify the sha256 of ``task-fingerfootlips_bold.json`` |
| 17 | +after download and refuse to proceed if the content has drifted. |
| 18 | +
|
| 19 | +Runs against the public HTTP S3 endpoint with no credentials and no |
| 20 | +third-party dependencies, so it works on any environment where |
| 21 | +``uv run`` does. |
| 22 | +
|
| 23 | +Usage:: |
| 24 | +
|
| 25 | + uv run scripts/download_ds000114.py [TARGET_DIR] |
| 26 | +
|
| 27 | +Default TARGET_DIR is ``tests/data/ds000114`` (under the repo root). |
| 28 | +Idempotent: exits early if the sentinel T1w file already exists. |
| 29 | +""" |
| 30 | + |
| 31 | +from __future__ import annotations |
| 32 | + |
| 33 | +import argparse |
| 34 | +import hashlib |
| 35 | +import logging |
| 36 | +import sys |
| 37 | +import urllib.error |
| 38 | +import urllib.parse |
| 39 | +import urllib.request |
| 40 | +import xml.etree.ElementTree as ET |
| 41 | +from pathlib import Path |
| 42 | + |
# OpenNeuro snapshot the pinned checksum below was verified against. It is
# informational only: the S3 mirror cannot be addressed by snapshot.
SNAPSHOT_TAG = "1.0.2"
# Path-style public endpoint of the OpenNeuro bucket.
S3_BASE = "https://s3.amazonaws.com/openneuro.org"
DATASET = "ds000114"
SUBJECT = "sub-01"
# Dataset-level files fetched in addition to the subject tree.
SIDECARS = (
    "dataset_description.json",
    "participants.tsv",
    "task-fingerfootlips_bold.json",
)
# Expected digest of task-fingerfootlips_bold.json; a mismatch means the
# upstream content has drifted from SNAPSHOT_TAG.
TASK_JSON_SHA256 = "9fd44f65a772e05282c20bdfa2a9775e02f9a7f562c5c96bbf4fd30632540355"
# Namespace prefix (Clark notation) of the elements in S3 listing responses.
S3_XMLNS = "{http://s3.amazonaws.com/doc/2006-03-01/}"

# tests/data/<DATASET> under the directory two levels above this script.
_DEFAULT_TARGET = (
    Path(__file__).resolve().parent.parent.joinpath("tests", "data", DATASET)
)
# File whose presence marks the dataset as already downloaded.
_SENTINEL_REL = Path(SUBJECT, "ses-test", "anat", f"{SUBJECT}_ses-test_T1w.nii.gz")

_logger = logging.getLogger("download_ds000114")
| 59 | + |
| 60 | + |
def _list_keys(prefix: str) -> list[str]:
    """Enumerate all S3 object keys beneath ``prefix`` (paginated).

    Issues ``list-type=2`` listing requests against the public bucket and
    follows continuation tokens until the listing is no longer truncated.

    Args:
        prefix: Key prefix to list, e.g. ``"ds000114/sub-01/"``.

    Returns:
        Every non-empty ``Key`` found in the responses, in response order.

    Raises:
        urllib.error.URLError: If a listing request fails.
        xml.etree.ElementTree.ParseError: If a response is not well-formed
            XML.
    """
    keys: list[str] = []
    continuation: str | None = None
    while True:
        params = {"list-type": "2", "prefix": prefix}
        if continuation is not None:
            params["continuation-token"] = continuation
        # urlencode escapes every reserved character in both the prefix and
        # the opaque continuation token (which may contain "/", "+" or "="),
        # so neither can corrupt the query string. quote_via=quote encodes
        # spaces as %20 rather than "+".
        query = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
        with urllib.request.urlopen(f"{S3_BASE}/?{query}") as resp:  # noqa: S310
            root = ET.parse(resp).getroot()  # noqa: S314 (trusted S3 response)
        keys.extend(
            key
            for contents in root.findall(f"{S3_XMLNS}Contents")
            if (key := contents.findtext(f"{S3_XMLNS}Key"))
        )
        if root.findtext(f"{S3_XMLNS}IsTruncated") != "true":
            return keys
        continuation = root.findtext(f"{S3_XMLNS}NextContinuationToken")
        if not continuation:
            # Truncated but no token to resume from: return what we have
            # rather than re-requesting the first page forever.
            return keys
| 80 | + |
| 81 | + |
def _download(key: str, dest: Path) -> None:
    """Fetch ``key`` from the public openneuro S3 mirror to ``dest``.

    The response body is streamed into a sibling ``<name>.part`` file and
    renamed onto ``dest`` only after it has been written completely, so
    ``dest`` never holds a truncated download.

    Args:
        key: Object key inside the bucket, e.g.
            ``"ds000114/participants.tsv"``.
        dest: Local destination path; missing parent directories are
            created.

    Raises:
        urllib.error.URLError: If the request fails.
        OSError: If the file cannot be written or renamed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    url = f"{S3_BASE}/{key}"
    _logger.info("GET %s", key)
    partial = dest.with_name(f"{dest.name}.part")
    try:
        with (
            urllib.request.urlopen(url) as resp,  # noqa: S310
            partial.open("wb") as out,
        ):
            # 1 MiB chunks keep memory flat for the large NIfTI images.
            while chunk := resp.read(1 << 20):
                out.write(chunk)
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
| 93 | + |
| 94 | + |
def _sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so it never has to fit in memory.
    """
    piece_size = 1 << 20
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        # iter(callable, sentinel) stops at the first empty read, i.e. EOF.
        for piece in iter(lambda: stream.read(piece_size), b""):
            hasher.update(piece)
    return hasher.hexdigest()
| 101 | + |
| 102 | + |
def download(target_dir: Path) -> None:
    """Download the ds000114 sub-01 subset into ``target_dir`` if missing.

    The sentinel T1w file is what later runs use to decide that the dataset
    is already present, so the steps are ordered so that it can only exist
    once everything else is in place: the sidecars are fetched and the task
    JSON checksum is verified first, then the subject files are downloaded
    with the sentinel last.

    Args:
        target_dir: Destination root; files are laid out beneath it using
            their key paths relative to the dataset prefix.

    Raises:
        RuntimeError: If no objects are listed under the subject prefix, or
            if the task JSON checksum differs from ``TASK_JSON_SHA256``.
        urllib.error.URLError: If a listing or download request fails.
    """
    sentinel = target_dir / _SENTINEL_REL
    if sentinel.exists():
        _logger.info("%s: already present at %s", DATASET, target_dir)
        return

    _logger.info(
        "%s: downloading %s (both sessions) + sidecars to %s (snapshot %s)",
        DATASET,
        SUBJECT,
        target_dir,
        SNAPSHOT_TAG,
    )

    # Keys ending in "/" are zero-byte folder placeholders; writing one would
    # create a regular file where a directory is needed.
    subject_keys = [
        key for key in _list_keys(f"{DATASET}/{SUBJECT}/") if not key.endswith("/")
    ]
    if not subject_keys:
        raise RuntimeError(
            f"No objects found under {DATASET}/{SUBJECT}/; bucket layout may "
            "have changed."
        )

    # Verify upstream content before pulling the large subject files, and
    # before the sentinel exists, so a drifted dataset can never be reported
    # as "already present" on a later run.
    for name in SIDECARS:
        _download(f"{DATASET}/{name}", target_dir / name)

    actual = _sha256(target_dir / "task-fingerfootlips_bold.json")
    if actual != TASK_JSON_SHA256:
        raise RuntimeError(
            "task-fingerfootlips_bold.json sha256 mismatch; upstream may have "
            f"changed.\n expected: {TASK_JSON_SHA256}\n actual: {actual}\n"
            f" verify and update SNAPSHOT_TAG + TASK_JSON_SHA256."
        )

    sentinel_key = f"{DATASET}/{_SENTINEL_REL.as_posix()}"
    # Stable sort on a boolean: every other key keeps its order and the
    # sentinel moves to the end, making it a true completion marker.
    subject_keys.sort(key=lambda key: key == sentinel_key)
    try:
        for key in subject_keys:
            rel = Path(key).relative_to(DATASET)
            _download(key, target_dir / rel)
    except BaseException:
        # Covers Ctrl-C too: never leave a possibly truncated sentinel behind.
        sentinel.unlink(missing_ok=True)
        raise
    _logger.info("%s: download complete", DATASET)
| 139 | + |
| 140 | + |
def main(argv: list[str] | None = None) -> int:
    """Entry point. Returns a shell-style exit code.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``2`` on a network failure, ``3`` when
        ``download`` raises ``RuntimeError`` (nothing listed under the
        subject prefix, or a checksum mismatch).
    """
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument(
        "target_dir",
        nargs="?",
        type=Path,
        default=_DEFAULT_TARGET,
        help=f"Destination directory (default: {_DEFAULT_TARGET}).",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
    try:
        download(args.target_dir)
    except (urllib.error.URLError, ConnectionError, TimeoutError) as exc:
        # urlopen() wraps failures of the request itself in URLError, but
        # errors raised while the response body is being read surface as bare
        # ConnectionError / TimeoutError. Report them the same way.
        _logger.error("%s: network error: %s", DATASET, exc)
        return 2
    except RuntimeError as exc:
        _logger.error("%s: %s", DATASET, exc)
        return 3
    return 0
| 163 | + |
| 164 | + |
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    sys.exit(main())
0 commit comments