Skip to content

Commit 02fb2cc

Browse files
fix: raise ValueError when combat batch has fewer than 2 cells (#3994)
Co-authored-by: Philipp A. <flying-sheep@web.de>
1 parent a5e5761 commit 02fb2cc

3 files changed

Lines changed: 35 additions & 6 deletions

File tree

docs/release-notes/3994.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{func}`scanpy.pp.combat` now raises a {class}`ValueError` when a batch contains fewer than 2 cells, instead of silently producing NaN values in the corrected data {smaller}`L Zhang`

src/scanpy/preprocessing/_combat.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -202,10 +202,22 @@ def combat( # noqa: PLR0915
202202
sanitize_anndata(adata)
203203

204204
# construct a pandas series of the batch annotation
205-
model = adata.obs[[key, *(covariates if covariates else [])]]
206-
batch_info = model.groupby(key, observed=True).indices.values()
205+
model: pd.DataFrame = adata.obs[[key, *(covariates if covariates else [])]]
206+
batch_info = model.groupby(key, observed=True).indices
207207
n_batch = len(batch_info)
208-
n_batches = np.array([len(v) for v in batch_info])
208+
n_batches = np.array([len(v) for v in batch_info.values()])
209+
210+
# check for batches with fewer than 2 cells
211+
small_batches = [
212+
batch for batch, size in zip(batch_info, n_batches, strict=True) if size < 2
213+
]
214+
if small_batches:
215+
msg = (
216+
f"Batches {small_batches!r} have fewer than 2 cells. "
217+
"ComBat requires at least 2 cells per batch to estimate "
218+
"within-batch variance. Filter these batches before running combat."
219+
)
220+
raise ValueError(msg)
209221
n_array = float(sum(n_batches))
210222

211223
# standardize across genes using a pooled variance estimator
@@ -220,7 +232,9 @@ def combat( # noqa: PLR0915
220232
la.inv(batch_design.T @ batch_design) @ batch_design.T @ s_data.T
221233
).values
222234
# first estimate for the multiplicative batch effect
223-
delta_hat = [s_data.iloc[:, batch_idxs].var(axis=1) for batch_idxs in batch_info]
235+
delta_hat = [
236+
s_data.iloc[:, batch_idxs].var(axis=1) for batch_idxs in batch_info.values()
237+
]
224238

225239
# empirically fix the prior hyperparameters
226240
gamma_bar = gamma_hat.mean(axis=1)
@@ -233,7 +247,7 @@ def combat( # noqa: PLR0915
233247
# gamma star and delta star will be our empirical bayes (EB) estimators
234248
# for the additive and multiplicative batch effect per batch and cell
235249
gamma_star, delta_star = [], []
236-
for i, batch_idxs in enumerate(batch_info):
250+
for i, batch_idxs in enumerate(batch_info.values()):
237251
# temp stores our estimates for the batch effect parameters.
238252
# temp[0] is the additive batch effect
239253
# temp[1] is the multiplicative batch effect
@@ -257,7 +271,7 @@ def combat( # noqa: PLR0915
257271

258272
# we now apply the parametric adjustment to the standardized data from above
259273
# loop over all batches in the data
260-
for j, batch_idxs in enumerate(batch_info):
274+
for j, batch_idxs in enumerate(batch_info.values()):
261275
# we basically subtract the additive batch effect, rescale by the ratio
262276
# of multiplicative batch effect to pooled variance and add the overall gene
263277
# wise mean

tests/test_combat.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,20 @@ def test_combat_obs_names():
7575
assert_equal(a, b)
7676

7777

78+
def test_combat_single_cell_batch():
79+
"""Test that combat raises an error when a batch has fewer than 2 cells.
80+
81+
Regression test for https://github.com/scverse/scanpy/issues/1175
82+
"""
83+
adata = sc.datasets.blobs()
84+
# Create a batch where one category has only 1 cell
85+
batch = pd.Categorical(["single"] + ["other"] * (adata.n_obs - 1))
86+
adata.obs["batch"] = batch
87+
88+
with pytest.raises(ValueError, match="fewer than 2 cells"):
89+
sc.pp.combat(adata, key="batch")
90+
91+
7892
def test_silhouette():
7993
# this test checks whether combat can align data from several gaussians
8094
# it checks this by computing the silhouette coefficient in a pca embedding

0 commit comments

Comments
 (0)