217 changes: 216 additions & 1 deletion docs/notebooks/example.ipynb
@@ -3,9 +3,224 @@
{
"cell_type": "markdown",
"metadata": {},
"source": "# Quickstart `annbatch`"
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"This notebook will walk you through the following steps:\n",
"1. How to convert an existing collection of `h5ad` files into `annbatch` format\n",
"2. How to load the converted dataset using `annbatch`"
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Download an example dataset from CELLxGENE\n",
"!wget https://datasets.cellxgene.cziscience.com/866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad"
],
Collaborator:
Maybe we should do two datasets? It's easy enough and highlights that we can handle the var_space.
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"## IMPORTANT: Configure zarrs\n",
"\n",
"This step is both required for converting existing `h5ad` files into `annbatch` format as well as for the data loading part."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Example notebook"
"import zarr\n",
"import zarrs # noqa\n",
"\n",
"zarr.config.set(\n",
" {\n",
" \"threading.max_workers\": 5,\n",
" \"codec_pipeline.path\": \"zarrs.ZarrsCodecPipeline\",\n",
" \"concurrency\": 4,\n",
Collaborator:
Either don't set the max_workers and concurrency or make it os.cpu_count dependent. I just wouldn't set it tbh.

Collaborator:
Although if you're going to have this big section, then maybe you can explain the settings.

Collaborator Author:
I've just double checked this; for me it doesn't give that much of a speed increase. Would be fine with removing it based on this.

Collaborator:
I'm also fine removing this, yea.
" }\n",
")"
],
"outputs": [],
"execution_count": null
},
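The reviewers above suggest either dropping the hard-coded worker counts or deriving them from the machine. A minimal sketch of the latter, assuming the same `zarr`/`zarrs` setup as in the cell above; the exact values are not a recommendation, just an illustration of tying the setting to `os.cpu_count()`:

```python
import os

import zarr
import zarrs  # noqa: F401

# Derive the worker count from the available cores instead of hard-coding it.
n_cpus = os.cpu_count() or 1

zarr.config.set(
    {
        "threading.max_workers": n_cpus,
        "codec_pipeline.path": "zarrs.ZarrsCodecPipeline",
    }
)
```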
{
"metadata": {},
"cell_type": "code",
"source": [
"import warnings\n",
"\n",
"# Suppress zarr vlen-utf8 codec warnings\n",
"warnings.filterwarnings(\n",
" \"ignore\",\n",
" message=\"The codec `vlen-utf8` is currently not part in the Zarr format 3 specification.*\",\n",
" category=UserWarning,\n",
" module=\"zarr.codecs.vlen_utf8\",\n",
")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Converting existing h5ad files into `annbatch` format"
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"The conversion code will take care of the following things:\n",
"* Align the gene spaces across all datasets listed in `adata_paths`\n",
" * The gene spaces are aligned based on the gene names provided in the `var_names` field of the individual `AnnData` objects.\n",
Collaborator:
It's specifically an outer join, though - "aligned" is ambiguous.
" * If you want to subset to specific gene space, you can provide a list of gene names via the `var_subset` parameter.\n",
"* Shuffle the cells across all datasets (this works on larger than memory datasets as well).\n",
" * This is important for block-wise shuffling during data loading.\n",
" * This can be disabled by setting `shuffle=False`\n",
"* Shard the combined dataset across multiple datasets:\n",
" * The size of each shard can be controlled via the `n_obs_per_dataset` parameter.\n",
" * We recommend to choose a shard size that comfortably fits into system memory."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"from arrayloaders import create_anndata_collection\n",
Collaborator:
Update imports :)))
"\n",
"create_anndata_collection(\n",
" # List all the h5ad files you want to include in the collection\n",
" adata_paths=[\n",
" \"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\",\n",
" ],\n",
" # Path to store the output collection\n",
" output_path=\"tahoe100_FULL\",\n",
Collaborator:
A different name?
" shuffle=True, # Whether to pre-shuffle the cells of the collection\n",
" n_obs_per_dataset=2_097_152, # Number of cells per dataset shard\n",
" var_subset=None, # Optionally subset the collection to a specific gene space\n",
" should_denseify=False,\n",
")"
],
"outputs": [],
"execution_count": null
},
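As mentioned in the list above, `var_subset` can restrict the collection to a specific gene space. A minimal sketch of that variant, using the same example dataset; the gene IDs and the output path below are placeholders, not values taken from the dataset:

```python
from arrayloaders import create_anndata_collection

# Placeholder gene IDs - replace with the var_names you actually want to keep.
genes_of_interest = ["ENSG00000141510", "ENSG00000121410"]

create_anndata_collection(
    adata_paths=["866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad"],
    output_path="tahoe100_gene_subset",  # placeholder output path
    shuffle=True,
    n_obs_per_dataset=2_097_152,
    var_subset=genes_of_interest,  # only genes from this list end up in the collection
    should_denseify=False,
)
```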
{
"metadata": {},
"cell_type": "markdown",
"source": "## Data loading example"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"from pathlib import Path\n",
"\n",
"COLLECTION_PATH = Path(\"tahoe100_FULL/\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"import anndata as ad\n",
"\n",
"from arrayloaders import ZarrSparseDataset\n",
"\n",
"ds = ZarrSparseDataset(\n",
" batch_size=4096, # Number of cells per yielded batch\n",
" chunk_size=256, # Number of cells to load from disk contiguously - default settings should work well\n",
" preload_nchunks=32, # Number of chunks to preload in the background + shuffle - default settings should work well\n",
")\n",
"\n",
"# Add dataset that should be used for training\n",
"ds.add_anndatas(\n",
" [\n",
" ad.AnnData(\n",
" X=ad.io.sparse_dataset(zarr.open(p)[\"X\"]),\n",
" obs=ad.io.read_elem(zarr.open(p)[\"obs\"]),\n",
" )\n",
" for p in COLLECTION_PATH.glob(\"*.zarr\")\n",
" ],\n",
" obs_keys=\"cell_type\",\n",
")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"IMPORTANT:\n",
Collaborator:
I think "IMPORTANT" should at least be bold.
"* The `ZarrSparseDataset` yields batches of sparse tensors.\n",
"* The conversion to dense tensors should be done on the GPU, as shown in the example below.\n",
" * First call `.cuda()` and then `.to_dense()`\n",
" * E.g. `x = x.cuda().to_dense()`\n",
" * This is significantly faster than doing the dense conversion on the CPU.\n"
Comment on lines +244 to +248

Collaborator:
Maybe mention preload_to_gpu here - i.e., if you have a GPU and can spare some extra memory, you should use preload_to_gpu and then you don't need to use .cuda().

Collaborator Author:
I've added the preload_to_gpu option. I would leave the .cuda() call; removing it might just confuse the user, and if everything is already on the GPU the .cuda() call doesn't do anything.

From the torch documentation:
"If this object is already in CUDA memory and on the correct device, then no copy is performed and the original object is returned."

Collaborator:
Oh rad, then I would have been ok leaving out preload_to_gpu, but now that this is moving in the direction of a guide in the docs rather than on the README.md, the extra detail is good.
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Iterate over dataloader\n",
"for batch in ds:\n",
" x, obs = batch\n",
" # Important: Convert to dense on GPU\n",
" x = x.cuda().to_dense()\n",
" # Feed data into your model\n",
" ..."
],
"outputs": [],
"execution_count": null
},
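The review thread above also mentions a `preload_to_gpu` option: with a GPU and some spare memory, preloaded chunks can be kept on the GPU so the explicit `.cuda()` call is no longer needed. A hedged sketch, assuming `preload_to_gpu` is a `ZarrSparseDataset` constructor argument and reusing `ad`, `zarr`, and `COLLECTION_PATH` from the cells above; check the API reference for the exact name and placement before relying on it:

```python
# Sketch only: `preload_to_gpu` comes from the review discussion above;
# verify the parameter name and location against the current API.
ds_gpu = ZarrSparseDataset(
    batch_size=4096,
    chunk_size=256,
    preload_nchunks=32,
    preload_to_gpu=True,  # stage preloaded chunks directly in GPU memory
)
ds_gpu.add_anndatas(
    [
        ad.AnnData(
            X=ad.io.sparse_dataset(zarr.open(p)["X"]),
            obs=ad.io.read_elem(zarr.open(p)["obs"]),
        )
        for p in COLLECTION_PATH.glob("*.zarr")
    ],
    obs_keys="cell_type",
)

for x, obs in ds_gpu:
    # Batches are already on the GPU, so no explicit .cuda() call is needed here.
    x = x.to_dense()
    ...
```

Per the torch documentation quoted in the review, calling `.cuda()` on a tensor that already lives on the correct device returns the original object, so keeping the `.cuda().to_dense()` pattern from the cell above would also be harmless.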
{
"metadata": {},
"cell_type": "markdown",
"source": "## Optional: Extend an existing collection with a new dataset"
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"You might want to extend an existing pre-shuffled collection with a new dataset.\n",
"This can be done using the `add_to_collection` function.\n",
"\n",
"This function will take care of shuffling the new dataset into the existing collection without having to re-shuffle the entire collection."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"from arrayloaders import add_to_collection\n",
"\n",
"add_to_collection(\n",
" adata_paths=[\n",
" \"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\",\n",
" ],\n",
" output_path=\"tahoe100_FULL\",\n",
" read_full_anndatas=True, # This should be set to False if the new datasets DO NOT fit into memory\n",
")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": "",
"outputs": [],
"execution_count": null
}
],
"metadata": {