Skip to content

Commit 6dea763

Browse files
authored
Merge pull request #111 from CSOgroup/110-dataset-import
Fix datasets import failing by developing standalone dataset system
2 parents 461165f + 27cd77d commit 6dea763

File tree

4 files changed

+156
-21
lines changed

4 files changed

+156
-21
lines changed

.github/workflows/test.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@ jobs:
2424
matrix:
2525
include:
2626
- os: ubuntu-latest
27-
python: "3.10"
27+
python: "3.11"
2828
- os: ubuntu-latest
29-
python: "3.12"
29+
python: "3.13"
3030
- os: ubuntu-latest
31-
python: "3.12"
31+
python: "3.13"
3232
pip-flags: "--pre"
3333
name: PRE-RELEASE DEPENDENCIES
3434

pyproject.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ exclude = [
1010

1111
[project]
1212
name = "cellcharter"
13-
version = "0.3.5"
13+
version = "0.3.6"
1414
description = "A Python package for the identification, characterization and comparison of spatial clusters from spatial -omics data."
1515
readme = "README.md"
16-
requires-python = ">=3.10,<3.14"
16+
requires-python = ">=3.11,<3.14"
1717
license = {file = "LICENSE"}
1818
authors = [
1919
{name = "CSO group"},
@@ -28,13 +28,13 @@ dependencies = [
2828
"anndata",
2929
"scikit-learn",
3030
"squidpy >= 1.6.3",
31-
"torchgmm >= 0.1.2",
31+
"torchgmm >= 0.1.4",
3232
# for debug logging (referenced from the issue template)
3333
"session-info",
34-
"spatialdata",
35-
"spatialdata-plot",
3634
"rasterio",
3735
"sknw",
36+
"spatialdata",
37+
"spatialdata-plot",
3838
]
3939

4040
[project.optional-dependencies]
Lines changed: 110 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,120 @@
1-
from copy import copy
1+
from __future__ import annotations
22

3-
from squidpy.datasets._utils import AMetadata
3+
import os
4+
from dataclasses import dataclass
5+
from pathlib import Path
6+
from urllib.error import HTTPError, URLError
7+
from urllib.parse import urlparse
8+
from urllib.request import Request, urlopen
49

5-
_codex_mouse_spleen = AMetadata(
10+
import anndata as ad
11+
12+
try:
13+
from tqdm.auto import tqdm
14+
except ImportError: # pragma: no cover - optional dependency
15+
tqdm = None
16+
17+
18+
@dataclass(frozen=True)
19+
class DatasetMetadata:
20+
name: str
21+
doc_header: str
22+
shape: tuple[int, int]
23+
url: str
24+
filename: str
25+
26+
27+
# Registry entry for the Goltsev et al. CODEX mouse spleen dataset.
# The URL points at the ndownloader CDN host directly, so downloads do not
# depend on the figshare.com /ndownloader redirect.
_codex_mouse_spleen = DatasetMetadata(
    name="codex_mouse_spleen",
    doc_header="Pre-processed CODEX dataset of mouse spleen from `Goltsev et al "
    "<https://doi.org/10.1016/j.cell.2018.07.010>`__.",
    shape=(707474, 29),  # presumably (n_obs, n_vars); not checked by the loader — confirm
    url="https://ndownloader.figshare.com/files/38538101",
    filename="codex_mouse_spleen.h5ad",
)
1235

13-
for name, var in copy(locals()).items():
14-
if isinstance(var, AMetadata):
15-
var._create_function(name, globals())
36+
37+
def _default_datasets_dir() -> Path:
    """Return the dataset cache directory, honouring ``CELLCHARTER_DATA_DIR``."""
    override = os.environ.get("CELLCHARTER_DATA_DIR")
    if override is not None:
        return Path(override)
    return Path.home() / ".cache" / "cellcharter" / "datasets"
39+
40+
41+
def _is_hdf5_file(path: Path) -> bool:
    """Return ``True`` iff *path* exists and begins with the 8-byte HDF5 signature."""
    magic = b"\x89HDF\r\n\x1a\n"
    if not path.exists():
        return False
    if path.stat().st_size < len(magic):
        return False
    # Only the header is read, so validating large files stays cheap.
    with path.open("rb") as handle:
        header = handle.read(len(magic))
    return header == magic
46+
47+
48+
def _normalize_figshare_url(url: str) -> str:
    """Rewrite legacy ``figshare.com/ndownloader/files/...`` links to the CDN host.

    Any other URL is returned unchanged.
    """
    parts = urlparse(url)
    legacy = parts.netloc == "figshare.com" and parts.path.startswith("/ndownloader/files/")
    if not legacy:
        return url
    # Keep only the /files/... tail and point it at the direct-download host.
    return "https://ndownloader.figshare.com" + parts.path.removeprefix("/ndownloader")
54+
55+
56+
def _download_file(url: str, destination: Path) -> None:
    """Download *url* to *destination* atomically, validating the HDF5 magic number.

    The payload is streamed to a ``.part`` sibling first and only renamed into
    place after validation, so *destination* is never left half-written.

    Raises
    ------
    RuntimeError
        If the transfer fails (wrapping the underlying URL error) or if the
        downloaded bytes are not a valid HDF5 file.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Stage into a temporary sibling; it is renamed only after validation.
    tmp_destination = destination.with_suffix(f"{destination.suffix}.part")

    request = Request(_normalize_figshare_url(url), headers={"User-Agent": "cellcharter-datasets"})
    progress = None
    try:
        with urlopen(request, timeout=60) as response, tmp_destination.open("wb") as output:
            # Content-Length may be absent or non-numeric; fall back to an
            # indeterminate progress bar in that case.
            content_length = response.headers.get("Content-Length")
            total = int(content_length) if content_length and content_length.isdigit() else None
            if tqdm is not None:  # tqdm is optional; skip the progress display if missing
                progress = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading {destination.name}")

            chunk_size = 1024 * 1024  # stream in 1 MiB chunks to bound memory use
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                output.write(chunk)
                if progress is not None:
                    progress.update(len(chunk))
    except Exception as error:  # noqa: B902
        # Remove the partial file so a retry starts from a clean slate.
        if tmp_destination.exists():
            tmp_destination.unlink()
        if isinstance(error, (HTTPError, URLError)):
            raise RuntimeError(f"Failed to download dataset from {url}.") from error
        raise error
    finally:
        if progress is not None:
            progress.close()

    # Reject HTML error pages or truncated transfers that are not HDF5 data.
    if not _is_hdf5_file(tmp_destination):
        tmp_destination.unlink(missing_ok=True)
        raise RuntimeError(
            f"Downloaded file from {url} is not a valid HDF5 file. " "Please check network access and dataset URL."
        )

    # Atomic rename: readers never observe a partially written destination.
    tmp_destination.replace(destination)
94+
95+
96+
def _resolve_dataset_path(metadata: DatasetMetadata, path: str | Path | None = None) -> Path:
    """Map a user-supplied *path* (or ``None``) to the concrete dataset file path.

    A *path* carrying a file suffix is treated as the target file itself;
    anything else is a directory into which ``metadata.filename`` is placed.
    """
    if path is None:
        location = _default_datasets_dir()
    else:
        location = Path(path)
    # An explicit file path (has a suffix) is honoured verbatim.
    return location if location.suffix else location / metadata.filename
101+
102+
103+
def _fetch_dataset_file(
    metadata: DatasetMetadata, path: str | Path | None = None, force_download: bool = False
) -> Path:
    """Ensure the dataset described by *metadata* exists locally and return its path.

    A cached file is reused only when it passes the HDF5 signature check;
    otherwise (or when *force_download* is set) it is replaced by a fresh copy.
    """
    target = _resolve_dataset_path(metadata, path=path)
    cached = target.exists() and _is_hdf5_file(target)
    if force_download or not cached:
        # Drop any stale/corrupt file before re-downloading.
        target.unlink(missing_ok=True)
        _download_file(metadata.url, target)
    return target
112+
113+
114+
def codex_mouse_spleen(path: str | Path | None = None, force_download: bool = False) -> ad.AnnData:
    """Pre-processed CODEX dataset of mouse spleen from `Goltsev et al <https://doi.org/10.1016/j.cell.2018.07.010>`__.

    Parameters
    ----------
    path
        Cache directory, or an explicit ``.h5ad`` file path, for the local
        copy. Defaults to the shared cache directory.
    force_download
        When ``True``, re-download even if a valid cached file exists.

    Returns
    -------
    The dataset as an :class:`anndata.AnnData` object.
    """
    return ad.read_h5ad(_fetch_dataset_file(_codex_mouse_spleen, path=path, force_download=force_download))
16118

17119

18-
__all__ = ["codex_mouse_spleen"] # noqa: F822
120+
# Public API of the datasets module; the download helpers above are private.
__all__ = ["codex_mouse_spleen"]

tests/conftest.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import time
2-
from urllib.error import HTTPError
2+
from pathlib import Path
3+
from urllib.error import HTTPError, URLError
4+
from urllib.request import Request, urlopen
35

46
import anndata as ad
57
import numpy as np
@@ -9,6 +11,35 @@
911

1012
# Shared AnnData test fixture, loaded once at collection time.
_adata = sc.read("tests/_data/test_data.h5ad")
_adata.raw = _adata.copy()
# Local cache location and figshare CDN URL for the CODEX test dataset.
_CODEX_PATH = Path("tests/_data/codex_adata.h5ad")
_CODEX_URL = "https://ndownloader.figshare.com/files/46832722"
16+
17+
18+
def _is_hdf5_file(path: Path) -> bool:
    """Return ``True`` when *path* looks like an HDF5 file (magic-number check)."""
    signature = b"\x89HDF\r\n\x1a\n"
    if not path.exists():
        return False
    if path.stat().st_size < len(signature):
        return False
    with path.open("rb") as stream:
        return stream.read(len(signature)) == signature
23+
24+
25+
def _download_codex(path: Path) -> None:
    """Download the CODEX test fixture to *path*, atomically via a ``.part`` file.

    Raises
    ------
    OSError
        If the downloaded payload is not a valid HDF5 file.

    Any failure removes the partial download so the next attempt starts clean.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_suffix(f"{path.suffix}.part")
    request = Request(_CODEX_URL, headers={"User-Agent": "cellcharter-tests"})

    try:
        with urlopen(request, timeout=60) as response, tmp_path.open("wb") as output:
            chunk_size = 1024 * 1024  # 1 MiB chunks keep memory bounded
            while chunk := response.read(chunk_size):
                output.write(chunk)
    except Exception:
        # Fix: previously a failed transfer left the stale .part file behind.
        # Clean it up (consistent with the package's _download_file helper) so
        # a retry never appends to or reuses garbage.
        tmp_path.unlink(missing_ok=True)
        raise

    # Reject HTML error pages or truncated transfers.
    if not _is_hdf5_file(tmp_path):
        tmp_path.unlink(missing_ok=True)
        raise OSError("Downloaded codex fixture is not a valid HDF5 file.")

    # Atomic rename so readers never observe a half-written file.
    tmp_path.replace(path)
1243

1344

1445
@pytest.fixture()
@@ -31,12 +62,14 @@ def codex_adata() -> ad.AnnData:
3162

3263
for attempt in range(max_retries):
3364
try:
34-
adata = sc.read(
35-
"tests/_data/codex_adata.h5ad", backup_url="https://figshare.com/ndownloader/files/46832722"
36-
)
65+
if not _is_hdf5_file(_CODEX_PATH):
66+
_CODEX_PATH.unlink(missing_ok=True)
67+
_download_codex(_CODEX_PATH)
68+
adata = ad.read_h5ad(_CODEX_PATH)
3769
adata.obs_names_make_unique()
3870
return adata[adata.obs["sample"].isin(["BALBc-1", "MRL-5"])].copy()
39-
except HTTPError as e:
71+
except (HTTPError, URLError, OSError) as e:
72+
_CODEX_PATH.unlink(missing_ok=True) # Force re-download on next attempt
4073
if attempt == max_retries - 1: # Last attempt
4174
pytest.skip(f"Failed to download test data after {max_retries} attempts: {str(e)}")
4275
time.sleep(retry_delay)

0 commit comments

Comments
 (0)