Skip to content

Commit 6dea763

Browse files
authored
Merge pull request #111 from CSOgroup/110-dataset-import
Fix datasets import failing by developing standalone dataset system
2 parents 461165f + 27cd77d commit 6dea763

File tree

4 files changed

+156
-21
lines changed

4 files changed

+156
-21
lines changed

.github/workflows/test.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ jobs:
2424
matrix:
2525
include:
2626
- os: ubuntu-latest
27-
python: "3.10"
27+
python: "3.11"
2828
- os: ubuntu-latest
29-
python: "3.12"
29+
python: "3.13"
3030
- os: ubuntu-latest
31-
python: "3.12"
31+
python: "3.13"
3232
pip-flags: "--pre"
3333
name: PRE-RELEASE DEPENDENCIES
3434

pyproject.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ exclude = [
1010

1111
[project]
1212
name = "cellcharter"
13-
version = "0.3.5"
13+
version = "0.3.6"
1414
description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data."
1515
readme = "README.md"
16-
requires-python = ">=3.10,<3.14"
16+
requires-python = ">=3.11,<3.14"
1717
license = {file = "LICENSE"}
1818
authors = [
1919
{name = "CSO group"},
@@ -28,13 +28,13 @@ dependencies = [
2828
"anndata",
2929
"scikit-learn",
3030
"squidpy >= 1.6.3",
31-
"torchgmm >= 0.1.2",
31+
"torchgmm >= 0.1.4",
3232
# for debug logging (referenced from the issue template)
3333
"session-info",
34-
"spatialdata",
35-
"spatialdata-plot",
3634
"rasterio",
3735
"sknw",
36+
"spatialdata",
37+
"spatialdata-plot",
3838
]
3939

4040
[project.optional-dependencies]
Lines changed: 110 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,120 @@
1-
from copy import copy
1+
from __future__ import annotations
22

3-
from squidpy.datasets._utils import AMetadata
3+
import os
4+
from dataclasses import dataclass
5+
from pathlib import Path
6+
from urllib.error import HTTPError, URLError
7+
from urllib.parse import urlparse
8+
from urllib.request import Request, urlopen
49

5-
_codex_mouse_spleen = AMetadata(
10+
import anndata as ad
11+
12+
try:
13+
from tqdm.auto import tqdm
14+
except ImportError: # pragma: no cover - optional dependency
15+
tqdm = None
16+
17+
18+
@dataclass(frozen=True)
19+
class DatasetMetadata:
20+
name: str
21+
doc_header: str
22+
shape: tuple[int, int]
23+
url: str
24+
filename: str
25+
26+
27+
# Registry entry for the Goltsev et al. CODEX mouse spleen dataset.
# The URL points at the ndownloader CDN host directly, so downloads do not
# depend on the figshare.com /ndownloader redirect.
_codex_mouse_spleen = DatasetMetadata(
    name="codex_mouse_spleen",
    doc_header="Pre-processed CODEX dataset of mouse spleen from `Goltsev et al "
    "<https://doi.org/10.1016/j.cell.2018.07.010>`__.",
    shape=(707474, 29),  # presumably (n_obs, n_vars); not checked by the loader — confirm
    url="https://ndownloader.figshare.com/files/38538101",
    filename="codex_mouse_spleen.h5ad",
)
1235

13-
for name, var in copy(locals()).items():
14-
if isinstance(var, AMetadata):
15-
var._create_function(name, globals())
36+
37+
def _default_datasets_dir() -> Path:
    """Return the dataset cache directory, honouring ``CELLCHARTER_DATA_DIR``."""
    override = os.environ.get("CELLCHARTER_DATA_DIR")
    if override is not None:
        return Path(override)
    return Path.home() / ".cache" / "cellcharter" / "datasets"
39+
40+
41+
def _is_hdf5_file(path: Path) -> bool:
    """Return ``True`` iff *path* exists and begins with the 8-byte HDF5 signature."""
    magic = b"\x89HDF\r\n\x1a\n"
    if not path.exists():
        return False
    if path.stat().st_size < len(magic):
        return False
    # Only the header is read, so validating large files stays cheap.
    with path.open("rb") as handle:
        header = handle.read(len(magic))
    return header == magic
46+
47+
48+
def _normalize_figshare_url(url: str) -> str:
    """Rewrite legacy ``figshare.com/ndownloader/files/...`` links to the CDN host.

    Any other URL is returned unchanged.
    """
    parts = urlparse(url)
    legacy = parts.netloc == "figshare.com" and parts.path.startswith("/ndownloader/files/")
    if not legacy:
        return url
    # Keep only the /files/... tail and point it at the direct-download host.
    return "https://ndownloader.figshare.com" + parts.path.removeprefix("/ndownloader")
54+
55+
56+
def _download_file(url: str, destination: Path) -> None:
    """Download *url* to *destination* atomically, validating the HDF5 magic number.

    The payload is streamed to a ``.part`` sibling first and only renamed into
    place after validation, so *destination* is never left half-written.

    Raises
    ------
    RuntimeError
        If the transfer fails (wrapping the underlying URL error) or if the
        downloaded bytes are not a valid HDF5 file.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Stage into a temporary sibling; it is renamed only after validation.
    tmp_destination = destination.with_suffix(f"{destination.suffix}.part")

    request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"})
    progress = None
    try:
        with urlopen(request, timeout=60) as response, tmp_destination.open("wb") as output:
            # Content-Length may be absent or non-numeric; fall back to an
            # indeterminate progress bar in that case.
            content_length = response.headers.get("Content-Length")
            total = int(content_length) if content_length and content_length.isdigit() else None
            if tqdm is not None:  # tqdm is optional; skip the progress display if missing
                progress = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading {destination.name}")

            chunk_size = 1024 * 1024  # stream in 1 MiB chunks to bound memory use
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                output.write(chunk)
                if progress is not None:
                    progress.update(len(chunk))
    except Exception as error:  # noqa: B902
        # Remove the partial file so a retry starts from a clean slate.
        if tmp_destination.exists():
            tmp_destination.unlink()
        if isinstance(error, (HTTPError, URLError)):
            raise RuntimeError(f"Failed to download dataset from {url}.") from error
        raise error
    finally:
        if progress is not None:
            progress.close()

    # Reject HTML error pages or truncated transfers that are not HDF5 data.
    if not _is_hdf5_file(tmp_destination):
        tmp_destination.unlink(missing_ok=True)
        raise RuntimeError(
            f"Downloaded file from {url} is not a valid HDF5 file. " "Please check network access and dataset URL."
        )

    # Atomic rename: readers never observe a partially written destination.
    tmp_destination.replace(destination)
94+
95+
96+
def _resolve_dataset_path(metadata: DatasetMetadata, path: str | Path | None = None) -> Path:
    """Map a user-supplied *path* (or ``None``) to the concrete dataset file path.

    A *path* carrying a file suffix is treated as the target file itself;
    anything else is a directory into which ``metadata.filename`` is placed.
    """
    if path is None:
        location = _default_datasets_dir()
    else:
        location = Path(path)
    # An explicit file path (has a suffix) is honoured verbatim.
    return location if location.suffix else location / metadata.filename
101+
102+
103+
def _fetch_dataset_file(
    metadata: DatasetMetadata, path: str | Path | None = None, force_download: bool = False
) -> Path:
    """Ensure the dataset described by *metadata* exists locally and return its path.

    A cached file is reused only when it passes the HDF5 signature check;
    otherwise (or when *force_download* is set) it is replaced by a fresh copy.
    """
    target = _resolve_dataset_path(metadata, path=path)
    cached = target.exists() and _is_hdf5_file(target)
    if force_download or not cached:
        # Drop any stale/corrupt file before re-downloading.
        target.unlink(missing_ok=True)
        _download_file(metadata.url, target)
    return target
112+
113+
114+
def codex_mouse_spleen(path: str | Path | None = None, force_download: bool = False) -> ad.AnnData:
    """Pre-processed CODEX dataset of mouse spleen from `Goltsev et al <https://doi.org/10.1016/j.cell.2018.07.010>`__.

    Parameters
    ----------
    path
        Cache directory, or an explicit ``.h5ad`` file path, for the local
        copy. Defaults to the shared cache directory.
    force_download
        When ``True``, re-download even if a valid cached file exists.

    Returns
    -------
    The dataset as an :class:`anndata.AnnData` object.
    """
    return ad.read_h5ad(_fetch_dataset_file(_codex_mouse_spleen, path=path, force_download=force_download))
16118

17119

18-
__all__ = ["codex_mouse_spleen"] # noqa: F822
120+
# Public API of the datasets module; the download helpers above are private.
__all__ = ["codex_mouse_spleen"]

tests/conftest.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import time
2-
from urllib.error import HTTPError
2+
from pathlib import Path
3+
from urllib.error import HTTPError, URLError
4+
from urllib.request import Request, urlopen
35

46
import anndata as ad
57
import numpy as np
@@ -9,6 +11,35 @@
911

1012
# Shared AnnData test fixture, loaded once at collection time.
_adata = sc.read("tests/_data/test_data.h5ad")
_adata.raw = _adata.copy()
# Local cache location and figshare CDN URL for the CODEX test dataset.
_CODEX_PATH = Path("tests/_data/codex_adata.h5ad")
_CODEX_URL = "https://ndownloader.figshare.com/files/46832722"
16+
17+
18+
def _is_hdf5_file(path: Path) -> bool:
    """Return ``True`` when *path* looks like an HDF5 file (magic-number check)."""
    signature = b"\x89HDF\r\n\x1a\n"
    if not path.exists():
        return False
    if path.stat().st_size < len(signature):
        return False
    with path.open("rb") as stream:
        return stream.read(len(signature)) == signature
23+
24+
25+
def _download_codex(path: Path) -> None:
    """Download the CODEX test fixture to *path*, atomically via a ``.part`` file.

    Raises
    ------
    OSError
        If the downloaded payload is not a valid HDF5 file.

    Any failure removes the partial download so the next attempt starts clean.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_suffix(f"{path.suffix}.part")
    request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"})

    try:
        with urlopen(request, timeout=60) as response, tmp_path.open("wb") as output:
            chunk_size = 1024 * 1024  # 1 MiB chunks keep memory bounded
            while chunk := response.read(chunk_size):
                output.write(chunk)
    except Exception:
        # Fix: previously a failed transfer left the stale .part file behind.
        # Clean it up (consistent with the package's _download_file helper) so
        # a retry never appends to or reuses garbage.
        tmp_path.unlink(missing_ok=True)
        raise

    # Reject HTML error pages or truncated transfers.
    if not _is_hdf5_file(tmp_path):
        tmp_path.unlink(missing_ok=True)
        raise OSError("Downloaded codex fixture is not a valid HDF5 file.")

    # Atomic rename so readers never observe a half-written file.
    tmp_path.replace(path)
1243

1344

1445
@pytest.fixture()
@@ -31,12 +62,14 @@ def codex_adata() -> ad.AnnData:
3162

3263
for attempt in range(max_retries):
3364
try:
34-
adata = sc.read(
35-
"tests/_data/codex_adata.h5ad", backup_url="https://figshare.com/ndownloader/files/46832722"
36-
)
65+
if not _is_hdf5_file(_CODEX_PATH):
66+
_CODEX_PATH.unlink(missing_ok=True)
67+
_download_codex(_CODEX_PATH)
68+
adata = ad.read_h5ad(_CODEX_PATH)
3769
adata.obs_names_make_unique()
3870
return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy()
39-
except HTTPError as e:
71+
except (HTTPError, URLError, OSError) as e:
72+
_CODEX_PATH.unlink(missing_ok=True) # Force re-download on next attempt
4073
if attempt == max_retries - 1: # Last attempt
4174
pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}")
4275
time.sleep(retry_delay)

0 commit comments

Comments
 (0)