Skip to content

Commit e4d3cf4

Browse files
authored
fix (#228)
1 parent 35a14e8 commit e4d3cf4

3 files changed

Lines changed: 33 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning][].
88
[keep a changelog]: https://keepachangelog.com/en/1.0.0/
99
[semantic versioning]: https://semver.org/spec/v2.0.0.html
1010

11+
## [0.1.2]
12+
13+
### Fixed
14+
- {func}`~ehrdata.io.from_pandas` with `format='long'` misaligned the entries of `.X`/`.layers` with `.obs` when the input DataFrame was not sorted by the observation id column; `.obs` is now reindexed to match the order of the data. ([#228](https://github.com/theislab/ehrdata/pull/228)) @eroell
15+
1116
## [0.1.1]
1217

1318
### Added

src/ehrdata/io/pandas.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,10 @@ def from_pandas(
259259

260260
tem_layer = xr_dataarray[long_format_keys["value_column"]].values
261261

262+
# xarray sorts the coordinates, so obs must be reindexed to match
263+
xr_obs_order = xr_dataarray[long_format_keys["observation_column"]].values
264+
obs = obs.reindex(xr_obs_order)
265+
262266
var = pd.DataFrame(index=xr_dataarray[long_format_keys["variable_column"]].values)
263267
var.index = var.index.astype(str)
264268
tem = pd.DataFrame(index=xr_dataarray[long_format_keys["time_column"]].values)

tests/io/test_pandas.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,30 @@ def test_from_pandas_longitudinal_long():
284284
assert np.array_equal(edata.tem.index.values, ["t1", "t2"])
285285

286286

287+
def test_from_pandas_longitudinal_long_unsorted_observations():
288+
"""Regression test: obs order must match the data, even when observations appear unsorted in the DataFrame."""
289+
df = pd.DataFrame(
290+
{
291+
"observation_id": ["p2", "p2", "p2", "p2", "p3", "p3", "p3", "p3", "p1", "p1", "p1", "p1"],
292+
"variable": ["HR", "HR", "temp", "temp", "HR", "HR", "temp", "temp", "HR", "HR", "temp", "temp"],
293+
"time": ["t0", "t1", "t0", "t1", "t0", "t1", "t0", "t1", "t0", "t1", "t0", "t1"],
294+
"value": [72, 78, 36.5, 37.1, 65, 68, 36.8, 36.9, 80, 90, 37.5, 38.2],
295+
}
296+
)
297+
edata = from_pandas(df, layer=DEFAULT_TEM_LAYER_NAME, format="long")
298+
_assert_shape_matches(edata, (3, 2, 2), check_X_None=True)
299+
300+
for obs_name in ["p1", "p2", "p3"]:
301+
obs_idx = edata.obs_names.get_loc(obs_name)
302+
expected = (
303+
df[df["observation_id"] == obs_name]
304+
.pivot(index="variable", columns="time", values="value")
305+
.loc[edata.var_names]
306+
.values
307+
)
308+
np.testing.assert_array_equal(edata.layers[DEFAULT_TEM_LAYER_NAME][obs_idx], expected)
309+
310+
287311
def test_from_pandas_invalid_format():
288312
df = pd.DataFrame()
289313
with pytest.raises(ValueError):

0 commit comments

Comments
 (0)