fix (#228)

eroell · web-flow · commit e4d3cf4c4401 · 2026-03-24T19:51:48.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning][].
 [keep a changelog]: https://keepachangelog.com/en/1.0.0/
 [semantic versioning]: https://semver.org/spec/v2.0.0.html
 
+## [0.1.2]
+
+### Fixed
+ - {func}`~ehrdata.io.from_pandas` with `format='long'` misaligned the entries of `.X`/`.layers` with `.obs` if the input DataFrame was not sorted by the observation ID column; this is now fixed. ([#228](https://github.com/theislab/ehrdata/pull/228)) @eroell
+
 ## [0.1.1]
 
 ### Added
diff --git a/src/ehrdata/io/pandas.py b/src/ehrdata/io/pandas.py
@@ -259,6 +259,10 @@ def from_pandas(
 
         tem_layer = xr_dataarray[long_format_keys["value_column"]].values
 
+        # xarray sorts the coordinates, so obs must be reindexed to match
+        xr_obs_order = xr_dataarray[long_format_keys["observation_column"]].values
+        obs = obs.reindex(xr_obs_order)
+
         var = pd.DataFrame(index=xr_dataarray[long_format_keys["variable_column"]].values)
         var.index = var.index.astype(str)
         tem = pd.DataFrame(index=xr_dataarray[long_format_keys["time_column"]].values)
diff --git a/tests/io/test_pandas.py b/tests/io/test_pandas.py
@@ -284,6 +284,30 @@ def test_from_pandas_longitudinal_long():
     assert np.array_equal(edata.tem.index.values, ["t1", "t2"])
 
 
+def test_from_pandas_longitudinal_long_unsorted_observations():
+    """Regression test: obs order must match the data, even when observations appear unsorted in the DataFrame."""
+    df = pd.DataFrame(
+        {
+            "observation_id": ["p2", "p2", "p2", "p2", "p3", "p3", "p3", "p3", "p1", "p1", "p1", "p1"],
+            "variable": ["HR", "HR", "temp", "temp", "HR", "HR", "temp", "temp", "HR", "HR", "temp", "temp"],
+            "time": ["t0", "t1", "t0", "t1", "t0", "t1", "t0", "t1", "t0", "t1", "t0", "t1"],
+            "value": [72, 78, 36.5, 37.1, 65, 68, 36.8, 36.9, 80, 90, 37.5, 38.2],
+        }
+    )
+    edata = from_pandas(df, layer=DEFAULT_TEM_LAYER_NAME, format="long")
+    _assert_shape_matches(edata, (3, 2, 2), check_X_None=True)
+
+    for obs_name in ["p1", "p2", "p3"]:
+        obs_idx = edata.obs_names.get_loc(obs_name)
+        expected = (
+            df[df["observation_id"] == obs_name]
+            .pivot(index="variable", columns="time", values="value")
+            .loc[edata.var_names]
+            .values
+        )
+        np.testing.assert_array_equal(edata.layers[DEFAULT_TEM_LAYER_NAME][obs_idx], expected)
+
+
 def test_from_pandas_invalid_format():
     df = pd.DataFrame()
     with pytest.raises(ValueError):