Skip to content

Commit 0a4a597

Browse files
Backport PR #4011 on branch 1.12.x (refactor: Move datasets to scverse AWS S3 urls) (#4021)
Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
1 parent 08d8bda commit 0a4a597

5 files changed

Lines changed: 17 additions & 33 deletions

File tree

docs/release-notes/4011.feat.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Use scverse S3 CloudFront URLs for datasets {smaller}`zethson`

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ classifiers = [
5252
dynamic = [ "version" ]
5353
dependencies = [
5454
"anndata>=0.10.8",
55+
"certifi",
5556
"fast-array-utils[accel,sparse]>=1.2.1",
5657
"h5py>=3.11",
5758
"joblib",

src/scanpy/_utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -847,7 +847,7 @@ def select_groups(
847847
return groups_order_subset, groups_masks_obs
848848

849849

850-
def check_presence_download(filename: Path, backup_url):
850+
def check_presence_download(filename: Path, backup_url: str):
851851
"""Check if file is present otherwise download."""
852852
if not filename.is_file():
853853
from ..readwrite import _download

src/scanpy/datasets/_datasets.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def burczynski06() -> AnnData:
130130
131131
"""
132132
filename = settings.datasetdir / "burczynski06/GDS1615_full.soft.gz"
133-
url = "ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz"
133+
url = "https://exampledata.scverse.org/scanpy/GDS1615_full.soft.gz"
134134
return read(filename, backup_url=url)
135135

136136

@@ -206,7 +206,9 @@ def moignard15() -> AnnData:
206206
207207
"""
208208
filename = settings.datasetdir / "moignard15/nbt.3154-S3.xlsx"
209-
backup_url = "https://static-content.springer.com/esm/art%3A10.1038%2Fnbt.3154/MediaObjects/41587_2015_BFnbt3154_MOESM4_ESM.xlsx"
209+
backup_url = (
210+
"https://exampledata.scverse.org/scanpy/41587_2015_BFnbt3154_MOESM4_ESM.xlsx"
211+
)
210212
adata = read(filename, sheet="dCt_values.txt", backup_url=backup_url)
211213
# filter out 4 genes as in Haghverdi et al. (2016)
212214
gene_subset = ~np.isin(adata.var_names, ["Eif2b1", "Mrpl19", "Polr2a", "Ubc"])
@@ -241,9 +243,8 @@ def paul15() -> AnnData:
241243
242244
Non-logarithmized raw data.
243245
244-
The data has been sent out by Email from the Amit Lab. An R version for
245-
loading the data can be found `here
246-
<https://github.com/theislab/scAnalysisTutorial>`_.
246+
The data was sent out by email from the Amit Lab.
247+
An R version for loading the data can be found `here <https://github.com/theislab/scAnalysisTutorial>`_.
247248
248249
Returns
249250
-------
@@ -262,7 +263,7 @@ def paul15() -> AnnData:
262263

263264
filename = settings.datasetdir / "paul15/paul15.h5"
264265
filename.parent.mkdir(exist_ok=True)
265-
backup_url = "https://falexwolf.de/data/paul15.h5"
266+
backup_url = "https://exampledata.scverse.org/scanpy/paul15.h5"
266267
_utils.check_presence_download(filename, backup_url)
267268
with h5py.File(filename, "r") as f:
268269
# Coercing to float32 for backwards compatibility
@@ -332,8 +333,7 @@ def pbmc68k_reduced() -> AnnData:
332333
It was saved keeping only 724 cells and 221 highly variable genes.
333334
334335
The saved file contains the annotation of cell types (key: `'bulk_labels'`),
335-
UMAP coordinates, louvain clustering and gene rankings based on the
336-
`bulk_labels`.
336+
UMAP coordinates, louvain clustering and gene rankings based on the `bulk_labels`.
337337
338338
.. [#norm] Back when the dataset was created, :func:`~scanpy.pp.normalize_per_cell` was used instead.
339339
.. _PBMC 68k dataset: https://www.10xgenomics.com/datasets/fresh-68-k-pbm-cs-donor-a-1-standard-1-1-0
@@ -404,7 +404,7 @@ def pbmc3k() -> AnnData:
404404
var: 'gene_ids'
405405
406406
"""
407-
url = "https://falexwolf.de/data/pbmc3k_raw.h5ad"
407+
url = "https://exampledata.scverse.org/scanpy/pbmc3k_raw.h5ad"
408408
with warnings.catch_warnings():
409409
warnings.filterwarnings("ignore", category=OldFormatWarning)
410410
adata = read(settings.datasetdir / "pbmc3k_raw.h5ad", backup_url=url)
@@ -444,7 +444,7 @@ def pbmc3k_processed() -> AnnData:
444444
obsp: 'distances', 'connectivities'
445445
446446
""" # noqa: D401
447-
url = "https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad"
447+
url = "https://exampledata.scverse.org/scanpy/pbmc3k.h5ad"
448448

449449
with warnings.catch_warnings():
450450
warnings.filterwarnings("ignore", category=OldFormatWarning)
@@ -475,7 +475,7 @@ def _download_visium_dataset(
475475
if base_dir is None:
476476
base_dir = settings.datasetdir
477477

478-
url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}"
478+
url_prefix = f"https://exampledata.scverse.org/scanpy/visium/{spaceranger_version}/{sample_id}"
479479

480480
sample_dir = base_dir / sample_id
481481
sample_dir.mkdir(exist_ok=True)

src/scanpy/readwrite.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,9 +1061,10 @@ def _get_filename_from_key(key, ext=None) -> Path:
10611061

10621062

10631063
def _download(url: str, path: Path):
1064-
from urllib.error import URLError
1064+
from ssl import create_default_context
10651065
from urllib.request import Request, urlopen
10661066

1067+
from certifi import contents
10671068
from tqdm.auto import tqdm
10681069

10691070
blocksize = 1024 * 8
@@ -1072,26 +1073,7 @@ def _download(url: str, path: Path):
10721073
try:
10731074
req = Request(url, headers={"User-agent": "scanpy-user"})
10741075

1075-
try:
1076-
open_url = urlopen(req)
1077-
except URLError:
1078-
if not url.startswith("https://"):
1079-
raise # No need to try using certifi
1080-
1081-
msg = "Failed to open the url with default certificates."
1082-
try:
1083-
from certifi import where
1084-
except ImportError as e:
1085-
e.add_note(f"{msg} Please install `certifi` and try again.")
1086-
raise
1087-
else:
1088-
logg.warning(f"{msg} Trying to use certifi.")
1089-
1090-
from ssl import create_default_context
1091-
1092-
open_url = urlopen(req, context=create_default_context(cafile=where()))
1093-
1094-
with open_url as resp:
1076+
with urlopen(req, context=create_default_context(cadata=contents())) as resp:
10951077
total = resp.info().get("content-length", None)
10961078
with (
10971079
tqdm(

0 commit comments

Comments
 (0)