fix: differential abundance and aggregated posterior computations (#3789)

lordy5 · pre-commit-ci[bot] · ori-kron-wis · web-flow · commit 5dacc67d31ab · 2026-05-13T19:19:36.000+03:00
Fixes:
- Use the original anndata, not the subset of the anndata, for the
aggregated posterior computation. All cells in a sample should be
considered when computing the aggregated posterior (up to
num_cells_posterior).
- Update the tests accordingly
- Correctly pass scales, not variances, into torch distributions.
get_latent_representation returns variances, but the aggregated
posterior code previously treated its output as scales and so
accidentally passed variances into torch distributions (Normal and
Student's T expect scales instead). The square root of the variance is
now taken before constructing the distributions.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ori Kronfeld <ori.kronfeld@weizmann.ac.il>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: claude <claude@users.noreply.github.com>
diff --git a/src/scvi/model/base/_vaemixin.py b/src/scvi/model/base/_vaemixin.py
@@ -401,12 +401,13 @@ def get_aggregated_posterior(
             indices = np.arange(adata.n_obs)
 
         dataloader = self._make_data_loader(adata=adata, indices=indices, batch_size=batch_size)
-        qu_loc, qu_scale = self.get_latent_representation(
+        qu_loc, qu_var = self.get_latent_representation(
             batch_size=batch_size, return_dist=True, dataloader=dataloader, give_mean=True
         )
 
         qu_loc = torch.tensor(qu_loc, device=self.device)  # (n_cells, n_latent_u)
-        qu_scale = torch.tensor(qu_scale, device=self.device)
+        qu_var = torch.tensor(qu_var, device=self.device)
+        qu_scale = torch.sqrt(qu_var)
 
         if dof is None:
             components = dist.Normal(qu_loc, qu_scale)
@@ -421,6 +422,7 @@ def get_aggregated_posterior(
     def differential_abundance(
         self,
         adata: AnnOrMuData | None = None,
+        adata_sub: AnnOrMuData | None = None,
         sample_key: str | None = None,
         batch_size: int = 128,
         num_cells_posterior: int | None = None,
@@ -434,8 +436,13 @@ def differential_abundance(
         Parameters
         ----------
         adata
+            The full data object used to compute each aggregated posterior.
+            Defaults to the AnnData object used to initialize the model.
+        adata_sub
             The data object to compute the differential abundance for.
-            For very large datasets, this should be a subset of the original data object.
+            For very large datasets, this should be used to pass in a subset of the full data
+            object. The aggregated posteriors are still computed from the full data object.
+            The resulting log_probs matrix is stored in adata_sub.obsm
         sample_key
             Key for the sample covariate.
         batch_size
@@ -451,14 +458,17 @@ def differential_abundance(
         from tqdm import tqdm
 
         adata = self._validate_anndata(adata)
+        if adata_sub is None:
+            adata_sub = adata
+        else:
+            adata_sub = self._validate_anndata(adata_sub)
 
-        # In case user passes in a subset of model's anndata
-        adata_dataloader = self._make_data_loader(adata=adata, batch_size=batch_size)
+        adata_dataloader = self._make_data_loader(adata=adata_sub, batch_size=batch_size)
         us = self.get_latent_representation(
             batch_size=batch_size, dataloader=adata_dataloader, give_mean=True
         )
         dataloader = torch.utils.data.DataLoader(us, batch_size=batch_size)
-        unique_samples = adata.obs[sample_key].unique()
+        unique_samples = adata_sub.obs[sample_key].unique()
 
         log_probs = []
         for sample_name in tqdm(unique_samples):
@@ -476,6 +486,7 @@ def differential_abundance(
             log_probs.append(torch.cat(log_probs_, axis=0).cpu().numpy())
 
         log_probs = np.array(log_probs).T
-        log_probs_df = pd.DataFrame(data=log_probs, index=adata.obs_names, columns=unique_samples)
-
-        adata.obsm["da_log_probs"] = log_probs_df
+        log_probs_df = pd.DataFrame(
+            data=log_probs, index=adata_sub.obs_names, columns=unique_samples
+        )
+        adata_sub.obsm["da_log_probs"] = log_probs_df
diff --git a/tests/model/test_differential_abundance.py b/tests/model/test_differential_abundance.py
@@ -160,7 +160,7 @@ def test_differential_abundance(model: VAEMixin, adata: AnnData, mdata: MuData,
 
         subset_indices = np.random.choice(adata.n_obs, adata.n_obs // 2, replace=False)
         adata_subset = adata[subset_indices, :].copy()
-        model.differential_abundance(adata_subset, **da_kwargs)
+        model.differential_abundance(adata, adata_subset, **da_kwargs)
         assert isinstance(adata_subset.obsm["da_log_probs"], pd.DataFrame)