|
1 | | -from copy import copy |
| 1 | +from __future__ import annotations |
2 | 2 |
|
3 | | -from squidpy.datasets._utils import AMetadata |
| 3 | +import os |
| 4 | +from dataclasses import dataclass |
| 5 | +from pathlib import Path |
| 6 | +from urllib.error import HTTPError, URLError |
| 7 | +from urllib.parse import urlparse |
| 8 | +from urllib.request import Request, urlopen |
4 | 9 |
|
5 | | -_codex_mouse_spleen = AMetadata( |
| 10 | +import anndata as ad |
| 11 | + |
| 12 | +try: |
| 13 | + from tqdm.auto import tqdm |
| 14 | +except ImportError: # pragma: no cover - optional dependency |
| 15 | + tqdm = None |
| 16 | + |
| 17 | + |
@dataclass(frozen=True)
class DatasetMetadata:
    """Immutable description of one downloadable dataset.

    Instances act as registry entries: they bundle the download location
    with the local cache filename and a short documentation header.
    """

    # Public dataset identifier (matches the loader function's name).
    name: str
    # One-line reST description, suitable for generated docs.
    doc_header: str
    # Expected size of the data matrix — presumably (n_obs, n_vars) of the
    # resulting AnnData; confirm against the figshare record.
    shape: tuple[int, int]
    # Direct download URL for the .h5ad payload.
    url: str
    # File name used when caching the download locally.
    filename: str
| 25 | + |
| 26 | + |
# Registry entry for the Goltsev et al. (2018) CODEX mouse-spleen dataset,
# hosted on figshare as a pre-processed .h5ad file.
_codex_mouse_spleen = DatasetMetadata(
    name="codex_mouse_spleen",
    doc_header="Pre-processed CODEX dataset of mouse spleen from `Goltsev et al "
    "<https://doi.org/10.1016/j.cell.2018.07.010>`__.",
    shape=(707474, 29),
    url="https://ndownloader.figshare.com/files/38538101",
    filename="codex_mouse_spleen.h5ad",
)
12 | 35 |
|
13 | | -for name, var in copy(locals()).items(): |
14 | | - if isinstance(var, AMetadata): |
15 | | - var._create_function(name, globals()) |
| 36 | + |
def _default_datasets_dir() -> Path:
    """Return the directory used to cache downloaded datasets.

    Honours the ``CELLCHARTER_DATA_DIR`` environment variable when set;
    otherwise falls back to ``~/.cache/cellcharter/datasets``.
    """
    override = os.environ.get("CELLCHARTER_DATA_DIR")
    if override is not None:
        return Path(override)
    return Path.home() / ".cache" / "cellcharter" / "datasets"
| 39 | + |
| 40 | + |
def _is_hdf5_file(path: Path) -> bool:
    """Return ``True`` when *path* exists and starts with the 8-byte HDF5 magic."""
    if not path.exists():
        return False
    if path.stat().st_size < 8:
        # Too short to even hold the signature.
        return False
    with path.open("rb") as handle:
        signature = handle.read(8)
    return signature == b"\x89HDF\r\n\x1a\n"
| 46 | + |
| 47 | + |
def _normalize_figshare_url(url: str) -> str:
    """Rewrite ``figshare.com/ndownloader`` links to the direct download host.

    ``https://figshare.com/ndownloader/files/<id>`` becomes
    ``https://ndownloader.figshare.com/files/<id>``; every other URL is
    returned unchanged.
    """
    parts = urlparse(url)
    if parts.netloc != "figshare.com":
        return url
    if not parts.path.startswith("/ndownloader/files/"):
        return url
    # Keep only /files/<id> and point it at the dedicated download host.
    return f"https://ndownloader.figshare.com{parts.path.removeprefix('/ndownloader')}"
| 54 | + |
| 55 | + |
def _download_file(url: str, destination: Path) -> None:
    """Download *url* to *destination* atomically, validating the payload.

    The payload is streamed to a ``*.part`` sibling file, checked for the
    HDF5 signature, and only then moved into place — so an interrupted or
    corrupt download never leaves a broken file at *destination*.

    Raises
    ------
    RuntimeError
        If the download fails (network/HTTP error) or the payload is not a
        valid HDF5 file.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    tmp_destination = destination.with_suffix(f"{destination.suffix}.part")

    request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"})
    progress = None
    try:
        with urlopen(request, timeout=60) as response, tmp_destination.open("wb") as output:
            content_length = response.headers.get("Content-Length")
            total = int(content_length) if content_length and content_length.isdigit() else None
            if tqdm is not None:
                progress = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading {destination.name}")

            chunk_size = 1024 * 1024  # 1 MiB chunks keep memory use bounded
            while chunk := response.read(chunk_size):
                output.write(chunk)
                if progress is not None:
                    progress.update(len(chunk))
    except URLError as error:
        # HTTPError is a subclass of URLError, so this covers both.
        # missing_ok avoids a TOCTOU race between exists() and unlink().
        tmp_destination.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to download dataset from {url}.") from error
    except Exception:
        tmp_destination.unlink(missing_ok=True)
        raise  # bare re-raise preserves the original traceback
    finally:
        if progress is not None:
            progress.close()

    if not _is_hdf5_file(tmp_destination):
        tmp_destination.unlink(missing_ok=True)
        raise RuntimeError(
            f"Downloaded file from {url} is not a valid HDF5 file. Please check network access and dataset URL."
        )

    # Atomic rename: destination only ever holds a fully validated file.
    tmp_destination.replace(destination)
| 94 | + |
| 95 | + |
def _resolve_dataset_path(metadata: DatasetMetadata, path: str | Path | None = None) -> Path:
    """Map *path* to the concrete file where the dataset should live.

    ``None`` resolves into the default cache directory; a path carrying a
    file suffix is treated as the target file itself; any other path is a
    directory into which ``metadata.filename`` is placed.
    """
    if path is None:
        base = _default_datasets_dir()
    else:
        base = Path(path)
    # A suffix (e.g. ".h5ad") marks *base* as a file rather than a directory.
    return base if base.suffix else base / metadata.filename
| 101 | + |
| 102 | + |
def _fetch_dataset_file(
    metadata: DatasetMetadata, path: str | Path | None = None, force_download: bool = False
) -> Path:
    """Ensure the dataset file exists locally and return its path.

    A cached copy is reused when it already looks like a valid HDF5 file;
    otherwise — or when *force_download* is set — it is removed and fetched
    again from ``metadata.url``.
    """
    dataset_path = _resolve_dataset_path(metadata, path=path)
    cached_is_valid = dataset_path.exists() and _is_hdf5_file(dataset_path)
    if force_download or not cached_is_valid:
        dataset_path.unlink(missing_ok=True)
        _download_file(metadata.url, dataset_path)
    return dataset_path
| 112 | + |
| 113 | + |
def codex_mouse_spleen(path: str | Path | None = None, force_download: bool = False) -> ad.AnnData:
    """Pre-processed CODEX dataset of mouse spleen from `Goltsev et al <https://doi.org/10.1016/j.cell.2018.07.010>`__.

    Parameters
    ----------
    path
        Cache directory, or an explicit ``.h5ad`` file path. ``None`` uses
        the default cache directory.
    force_download
        Re-download the file even when a valid cached copy exists.

    Returns
    -------
    The dataset as an :class:`anndata.AnnData` object.
    """
    local_file = _fetch_dataset_file(_codex_mouse_spleen, path=path, force_download=force_download)
    return ad.read_h5ad(local_file)
16 | 118 |
|
17 | 119 |
|
18 | | -__all__ = ["codex_mouse_spleen"] # noqa: F822 |
# Public API of this module: only the dataset loader function is exported.
__all__ = ["codex_mouse_spleen"]
0 commit comments