Skip to content

Commit 4238647

Browse files
committed
fixed bug in concatenate, present in releases 0.5.2, 0.5.3, 0.5.4
1 parent 6715787 commit 4238647

File tree

3 files changed

+30
-13
lines changed

3 files changed

+30
-13
lines changed

anndata/base.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -1301,9 +1301,12 @@ def copy(self, filename=None):
13011301
return AnnData(filename=filename)
13021302

13031303
def concatenate(self, *adatas, join='inner', batch_key='batch', batch_categories=None, index_unique=None):
1304-
"""Concatenate along the observations axis after intersecting the variables names.
1304+
"""Concatenate along the observations axis.
13051305
1306-
The `.var`, `.varm`, and `.uns` attributes of the passed adatas are ignored.
1306+
The `.uns` and `.varm` attributes of the passed `adatas` are ignored.
1307+
1308+
If you use `join='outer'`, note that data that is not present is filled
1309+
with 0s. Use this with care.
13071310
13081311
Parameters
13091312
----------
@@ -1337,7 +1340,7 @@ def concatenate(self, *adatas, join='inner', batch_key='batch', batch_categories
13371340
>>> {'anno2': ['d3', 'd4']},
13381341
>>> {'var_names': ['b', 'c', 'd']})
13391342
>>>
1340-
>>> adata = adata1.concatenate(adata2, adata3)
1343+
>>> adata = adata1.concatenate(adata2, adata3, index_unique='-')
13411344
>>> adata.X
13421345
[[ 2. 3.]
13431346
[ 5. 6.]
@@ -1372,9 +1375,17 @@ def concatenate(self, *adatas, join='inner', batch_key='batch', batch_categories
13721375
'Making variable names unique for controlled concatenation.')
13731376
printed_info = True
13741377

1378+
# define variable names of joint AnnData
13751379
mergers = dict(inner=set.intersection, outer=set.union)
1376-
var_names = pd.Index(reduce(mergers[join], (set(ad.var_names) for ad in all_adatas)))
1377-
1380+
var_names_reduce = reduce(mergers[join], (set(ad.var_names) for ad in all_adatas))
1381+
# restore order of initial var_names, append non-sortable names at the end
1382+
var_names = []
1383+
for v in all_adatas[0].var_names:
1384+
if v in var_names_reduce:
1385+
var_names.append(v)
1386+
var_names_reduce.remove(v) # update the set
1387+
var_names = pd.Index(var_names + list(var_names_reduce))
1388+
13781389
if batch_categories is None:
13791390
categories = [str(i) for i, _ in enumerate(all_adatas)]
13801391
elif len(batch_categories) == len(all_adatas):
@@ -1392,11 +1403,11 @@ def concatenate(self, *adatas, join='inner', batch_key='batch', batch_categories
13921403
obs_i = 0 # start of next adata’s observations in X
13931404
out_obss = []
13941405
for i, ad in enumerate(all_adatas):
1395-
vars_ad_in_res = var_names.isin(ad.var_names)
1396-
vars_res_in_ad = ad.var_names.isin(var_names)
1406+
vars_intersect = [v for v in var_names if v in ad.var_names]
13971407

13981408
# X
1399-
X[obs_i:obs_i+ad.n_obs, vars_ad_in_res] = ad.X[:, vars_res_in_ad]
1409+
X[obs_i:obs_i+ad.n_obs,
1410+
var_names.isin(vars_intersect)] = ad[:, vars_intersect].X
14001411
obs_i += ad.n_obs
14011412

14021413
# obs
@@ -1412,13 +1423,14 @@ def concatenate(self, *adatas, join='inner', batch_key='batch', batch_categories
14121423
out_obss.append(obs)
14131424

14141425
# var
1415-
var.loc[vars_ad_in_res, ad.var.columns] = ad.var.loc[vars_res_in_ad, :]
1426+
# potentially adds additional columns
1427+
var.loc[vars_intersect, ad.var.columns] = ad.var.loc[vars_intersect, :]
14161428

14171429
obs = pd.concat(out_obss)
14181430
uns = all_adatas[0].uns
14191431
obsm = np.concatenate([ad.obsm for ad in all_adatas])
14201432
varm = self.varm # TODO
1421-
1433+
14221434
new_adata = AnnData(X, obs, var, uns, obsm, None, filename=self.filename)
14231435
if not obs.index.is_unique:
14241436
logg.info(

anndata/tests/base.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -189,19 +189,20 @@ def test_concatenate():
189189
adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
190190
{'obs_names': ['s3', 's4'],
191191
'anno1': ['c3', 'c4']},
192-
{'var_names': ['b', 'c', 'd']})
192+
{'var_names': ['d', 'c', 'b']})
193193
adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
194194
{'obs_names': ['s5', 's6'],
195195
'anno2': ['d3', 'd4']},
196-
{'var_names': ['b', 'c', 'd']})
196+
{'var_names': ['d', 'c', 'b']})
197197
adata = adata1.concatenate(adata2, adata3)
198198
assert adata.n_vars == 2
199199
assert adata.obs_keys() == ['anno1', 'anno2', 'batch']
200200
adata = adata1.concatenate(adata2, adata3, batch_key='batch1')
201201
assert adata.obs_keys() == ['anno1', 'anno2', 'batch1']
202202
adata = adata1.concatenate(adata2, adata3, batch_categories=['a1', 'a2', 'a3'])
203203
assert adata.obs['batch'].cat.categories.tolist() == ['a1', 'a2', 'a3']
204-
204+
assert adata.var_names.tolist() == ['b', 'c']
205+
205206

206207
def test_concatenate_sparse():
207208
from scipy.sparse import csr_matrix

docs/release_notes.rst

+4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
See all releases `here <https://github.com/theislab/anndata/releases>`_. The following lists selected improvements.
22

3+
Warning: there has been a bug in :func:`~anndata.AnnData.concatenate` in
4+
versions 0.5.2, 0.5.3 and 0.5.4: variable names were not assigned correctly. Use
5+
version 0.5.5.
6+
37

48
**February 9, 2018**: version 0.5
59

0 commit comments

Comments
 (0)