update example notebook

florianingelfinger · web-flow · commit 7b77adb0ec10 · 2025-04-23T11:15:04.000+02:00
diff --git a/docs/notebooks/Basic_CytoVI_workflow.ipynb b/docs/notebooks/Basic_CytoVI_workflow.ipynb
@@ -4,7 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# basic CytoVI workflow"
+    "# Basic notebook to train a CytoVI model for the integration of flow cytometry data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the following notebook we demonstrate the basic use of CytoVI and how a CytoVI model can be trained to integrate full spectrum cytometry data from two distinct batches. We first import FCS files and store them in the AnnData format commonly used for single-cell analyses, preprocess the data and perform QC checks, train a CytoVI model and inspect the batch-corrected latent space of CytoVI. Next, we cluster the CytoVI latent space, annotate the clusters and compute batch-corrected expression estimates. Subsequently, common downstream analysis tasks can be performed, such as quantification of the relative abundance of cell populations between groups of samples, differential expression analysis or trajectory inference. For more advanced applications of CytoVI we refer to the reproducibility repository and the documentation. Advanced tutorials will be published with the official release of CytoVI into scvi-tools."
    ]
   },
   {
@@ -25,7 +32,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# read data\n",
+    "# read the flow cytometry data of the two batches and store the raw expression values in adata.layers\n",
     "adata_batch1 = readfcs.read('../data/raw/Spectral flow/Nunez/For Chiquito/Raw_100000/batch1')\n",
     "adata_batch1.layers['raw'] = adata_batch1.X.copy()\n",
     "\n",
@@ -39,7 +46,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# preprocess\n",
+    "# preprocess the raw flow cytometry data; we first perform an inverse hyperbolic sine (arcsinh) transformation, followed by feature-wise min-max scaling\n",
+    "# as a starting point we recommend a global scaling factor of 2000 for full spectrum cytometry data, 100 for conventional flow cytometry data and\n",
+    "# 5 for mass cytometry data; the resulting distribution of scaled expression can be visualized in scatter plots and histograms (see below)\n",
     "cytovi.pp.arcsinh(adata_batch1, global_scaling_factor=2000)\n",
     "cytovi.pp.scale(adata_batch1)\n",
     "\n",
@@ -63,9 +72,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# do some plotting\n",
-    "cytovi.pl.histogram(adata, marker = 'all', groupby='batch', layer_key='transformed')\n",
-    "cytovi.pl.biaxial(adata, marker_x = 'CD3', marker_y = 'CD4', groupby='batch', layer_key='transformed')"
+    "# inspect the expression values in the scaled layer across the different batches\n",
+    "cytovi.pl.histogram(adata, marker = 'all', groupby='batch', layer_key='scaled')\n",
+    "cytovi.pl.biaxial(adata, marker_x = 'CD3', marker_y = 'CD4', color='batch', layer_key='scaled')"
    ]
   },
   {
@@ -74,7 +83,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# train model\n",
+    "# optional: subsample the anndata in equal proportions for each batch to speed up training for the purpose of this tutorial\n",
+    "adata = cytovi.pp.subsample(adata, groupby='batch', n_obs=10000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# register the anndata and train the CytoVI model controlling for the 'batch' covariate\n",
     "cytovi.CytoVI.setup_anndata(adata, layer='scaled', batch_key='batch')\n",
     "model = cytovi.CytoVI(adata)\n",
     "model.train()"
@@ -86,7 +105,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# save model on disk for later use\n",
+    "# optional: save model on disk for later use\n",
     "model_path = '../saved_models/'\n",
     "model.save(f'{model_path}my_cytovi_model', overwrite=True)"
    ]
@@ -97,7 +116,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# plot training dynamics\n",
+    "# optional: load model from disk\n",
+    "model = cytovi.CytoVI.load(f'{model_path}my_cytovi_model', adata=adata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# plot training dynamics, note: here we want to observe a plateau of the ELBO\n",
     "plt.subplot(1, 2, 1)\n",
     "plt.plot(model.history['elbo_train'])\n",
     "plt.xlabel('epochs')\n",
@@ -106,7 +135,8 @@
     "plt.subplot(1, 2, 2)\n",
     "plt.plot(model.history['elbo_validation'])\n",
     "plt.xlabel('epochs')\n",
-    "plt.ylabel('elbo_validation')"
+    "plt.ylabel('elbo_validation')\n",
+    "plt.show()"
    ]
   },
   {
@@ -115,7 +145,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# compute umap and cluster CytoVI latent space\n",
+    "# compute umap of the CytoVI latent space and cluster the latent space using leiden\n",
     "adata.obsm[\"X_CytoVI\"] = model.get_latent_representation()\n",
     "sc.pp.neighbors(adata, use_rep=\"X_CytoVI\")\n",
     "sc.tl.umap(adata)\n",
@@ -128,9 +158,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# plot data\n",
+    "# plot the integrated CytoVI latent space and the corresponding clusters including their expression profile\n",
     "sc.pl.umap(adata, color=[\"leiden_CytoVI\", \"batch\"])\n",
-    "sc.pl.matrixplot(adata, var_names=adata.var_names, groupby=\"leiden_CytoVI\")"
+    "sc.pl.matrixplot(adata, var_names=adata.var_names, groupby=\"leiden_CytoVI\", layer='scaled', dendrogram=True, standard_scale='var')"
    ]
   },
   {
@@ -139,7 +169,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# annotate cells\n",
+    "# annotate the leiden clusters into cell types and visualize on the CytoVI latent space\n",
     "annot_dict = {\n",
     "    '0': 'Granulocytes',\n",
     "    '1': 'Monocytes',\n",
@@ -158,12 +188,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# get imputed expression, note: by default we return the mean across all batches, you can change this by setting the transform_batch argument\n",
+    "# get imputed batch-corrected expression; note: by default we return the mean across all batches, you can change this by setting the transform_batch argument\n",
     "adata.layers['imputed'] = model.get_normalized_expression(adata, n_samples = 10)\n",
     "\n",
     "# visualize imputed expression\n",
     "sc.pl.umap(adata, color = ['CD3', 'CD4', 'CD8', 'CD19', 'CD56'], layer='imputed')"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional: save the anndata\n",
+    "adata.write('../data/processed/Nunez_100k_annotated.h5ad')"
+   ]
   }
  ],
  "metadata": {