fix: use pooch for reliable, hash-verified downloads in read_lazy and concat_on_disk doctests (scverse#2280)

katosh · flying-sheep · ilan-gold · web-flow · commit aa31c44a5e27 · 2026-01-08T17:02:27.000Z
Co-authored-by: Philipp A. <flying-sheep@web.de>
Co-authored-by: Ilan Gold <ilanbassgold@gmail.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -96,11 +96,10 @@ test-min = [
     "joblib",
     "boltons",
     "scanpy>=1.10",
-    # TODO: Is 1.0dev1 a real pre-release? https://pypi.org/project/httpx/#history
-    "httpx<1.0",         # For data downloading
     "dask[distributed]",
     "awkward>=2.6.3",
     "pyarrow",
+    "pooch",
     "anndata[dask]",
 ]
 
diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py
@@ -62,19 +62,25 @@ def read_lazy(
     Preparing example objects
 
     >>> import anndata as ad
-    >>> from urllib.request import urlretrieve
+    >>> import pooch
     >>> import scanpy as sc
     >>> base_url = "https://datasets.cellxgene.cziscience.com"
-    >>> def get_cellxgene_data(id_: str):
-    ...     out_path = sc.settings.datasetdir / f"{id_}.h5ad"
-    ...     if out_path.exists():
-    ...         return out_path
-    ...     file_url = f"{base_url}/{id_}.h5ad"
-    ...     sc.settings.datasetdir.mkdir(parents=True, exist_ok=True)
-    ...     urlretrieve(file_url, out_path)
-    ...     return out_path
-    >>> path_b_cells = get_cellxgene_data("a93eab58-3d82-4b61-8a2f-d7666dcdb7c4")
-    >>> path_fetal = get_cellxgene_data("d170ff04-6da0-4156-a719-f8e1bbefbf53")
+    >>> # To update hashes: pooch.retrieve(url, known_hash=None) prints the new hash
+    >>> def get_cellxgene_data(id_: str, hash_: str):
+    ...     return pooch.retrieve(
+    ...         f"{base_url}/{id_}.h5ad",
+    ...         known_hash=hash_,
+    ...         fname=f"{id_}.h5ad",
+    ...         path=sc.settings.datasetdir,
+    ...     )
+    >>> path_b_cells = get_cellxgene_data(
+    ...     "a93eab58-3d82-4b61-8a2f-d7666dcdb7c4",
+    ...     "sha256:dac90fe2aa8b78aee2c1fc963104592f8eff7b873ca21d01a51a5e416734651c",
+    ... )
+    >>> path_fetal = get_cellxgene_data(
+    ...     "d170ff04-6da0-4156-a719-f8e1bbefbf53",
+    ...     "sha256:d497eebca03533919877b6fc876e8c9d8ba063199ddc86dd9fbcb9d1d87a3622",
+    ... )
     >>> b_cells_adata = ad.experimental.read_lazy(path_b_cells)
     >>> fetal_adata = ad.experimental.read_lazy(path_fetal)
     >>> print(b_cells_adata)
diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py
@@ -556,19 +556,25 @@ def concat_on_disk(  # noqa: PLR0913
 
     First, let’s get some “big” datasets with a compatible ``var`` axis:
 
-    >>> import httpx
+    >>> import pooch
     >>> import scanpy as sc
     >>> base_url = "https://datasets.cellxgene.cziscience.com"
-    >>> def get_cellxgene_data(id_: str):
-    ...     out_path = sc.settings.datasetdir / f'{id_}.h5ad'
-    ...     if out_path.exists():
-    ...         return out_path
-    ...     file_url = f"{base_url}/{id_}.h5ad"
-    ...     sc.settings.datasetdir.mkdir(parents=True, exist_ok=True)
-    ...     out_path.write_bytes(httpx.get(file_url).content)
-    ...     return out_path
-    >>> path_b_cells = get_cellxgene_data('a93eab58-3d82-4b61-8a2f-d7666dcdb7c4')
-    >>> path_fetal = get_cellxgene_data('d170ff04-6da0-4156-a719-f8e1bbefbf53')
+    >>> # To update hashes: pooch.retrieve(url, known_hash=None) prints the new hash
+    >>> def get_cellxgene_data(id_: str, hash_: str):
+    ...     return pooch.retrieve(
+    ...         f"{base_url}/{id_}.h5ad",
+    ...         known_hash=hash_,
+    ...         fname=f"{id_}.h5ad",
+    ...         path=sc.settings.datasetdir,
+    ...     )
+    >>> path_b_cells = get_cellxgene_data(
+    ...     'a93eab58-3d82-4b61-8a2f-d7666dcdb7c4',
+    ...     'sha256:dac90fe2aa8b78aee2c1fc963104592f8eff7b873ca21d01a51a5e416734651c',
+    ... )
+    >>> path_fetal = get_cellxgene_data(
+    ...     'd170ff04-6da0-4156-a719-f8e1bbefbf53',
+    ...     'sha256:d497eebca03533919877b6fc876e8c9d8ba063199ddc86dd9fbcb9d1d87a3622',
+    ... )
 
     Now we can concatenate them on-disk: