Skip to content

Commit 6f3e9aa

Browse files
authored
Feature: Fill time gaps in from_pandas(..., format="long") with new boolean argument fill_time_gaps (#229)
* feature fill timegaps * fill only value variable with nan * start from 0 as mentioned in docstring * changelog
1 parent 7a7f4b3 commit 6f3e9aa

3 files changed

Lines changed: 58 additions & 1 deletion

File tree

CHANGELOG.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,17 @@ and this project adheres to [Semantic Versioning][].
1010

1111
## [0.1.2]
1212

13+
### Added
14+
- {func}`~ehrdata.io.from_pandas` with `format='long'` provides a new keyword argument `fill_time_gaps` that fills gaps in the time axis with NaN values, for the common case of integer time steps from 0 to the maximum time value ([#229](https://github.com/theislab/ehrdata/pull/229)) @eroell
15+
16+
### Modified
17+
- {func}`~ehrdata.dt.mimic_2` column `censor_flg` now follows the lifelines convention (1=event, 0=censored). Previously, the dataset loader returned the flags the other way around, as they are provided in the original dataset. ([#227](https://github.com/theislab/ehrdata/pull/227)) @sueoglu
18+
1319
### Fixed
14-
- {func}`~ehrdata.move_to_obs` with `format='long'` misordered entries in `.X`/`.layers` with `.obs` if the input df was not sorted for the obs id keys, which is now fixed. ([#228](https://github.com/theislab/ehrdata/pull/228)) @eroell
20+
- {func}`~ehrdata.io.from_pandas` with `format='long'` misaligned the entries in `.X`/`.layers` relative to `.obs` if the input df was not sorted by the observation id keys; this is now fixed. ([#228](https://github.com/theislab/ehrdata/pull/228)) @eroell
21+
22+
### Documentation
23+
- Documentation style polishing ([#223](https://github.com/theislab/ehrdata/pull/223)) @zethson
1524

1625
## [0.1.1]
1726

src/ehrdata/io/pandas.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def from_pandas(
2626
wide_format_time_suffix: str | None = None,
2727
long_format_keys: dict[Literal["observation_column", "variable_column", "time_column", "value_column"], str]
2828
| None = None,
29+
fill_time_gaps: bool = False,
2930
) -> EHRData:
3031
"""Transform a given :class:`~pandas.DataFrame` into an :class:`~ehrdata.EHRData` object.
3132
@@ -53,6 +54,11 @@ def from_pandas(
5354
"variable_column": "<the column name of the variable ids>",
5455
"time_column": "<the column name of the time>",
5556
"value_column": "<the column name of the values>"}`.
57+
fill_time_gaps: Use only if `format="long"`.
58+
If `True`, fills gaps in the numeric time axis with NaN values so that the 3rd dimension is a
59+
continuous integer range from 0 to the maximum time value.
60+
For example, if the data contains time indices ``[0, 1, 2, 5]``, the resulting time axis will be
61+
``[0, 1, 2, 3, 4, 5]`` with NaN values at indices 3 and 4 for all observations and variables.
5662
5763
Examples:
5864
>>> import ehrdata as ed
@@ -137,6 +143,10 @@ def from_pandas(
137143
err_msg = f"Invalid keys: {invalid_keys}. Please use only the following keys: {valid_long_format_keys}."
138144
raise ValueError(err_msg)
139145

146+
if fill_time_gaps and format != "long":
147+
err_msg = "fill_time_gaps should only be used if format is 'long'."
148+
raise ValueError(err_msg)
149+
140150
if format != "wide" and wide_format_time_suffix is not None:
141151
err_msg = "wide_format_time_suffix should only be used if format is 'wide'."
142152
raise ValueError(err_msg)
@@ -257,6 +267,16 @@ def from_pandas(
257267
]
258268
).to_xarray()
259269

270+
if fill_time_gaps:
271+
time_key = long_format_keys["time_column"]
272+
current_times = xr_dataarray[time_key].values
273+
full_range = np.arange(0, int(current_times.max()) + 1)
274+
# Select only the value variable before reindexing to avoid dtype conflicts
275+
# (e.g. datetime64 columns can't be filled with np.nan)
276+
xr_dataarray = xr_dataarray[[long_format_keys["value_column"]]].reindex(
277+
{time_key: full_range}, fill_value=np.nan
278+
)
279+
260280
tem_layer = xr_dataarray[long_format_keys["value_column"]].values
261281

262282
# xarray sorts the coordinates, so obs must be reindexed to match

tests/io/test_pandas.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,34 @@ def test_from_pandas_longitudinal_long_index_column_not_implemented():
327327
from_pandas(df, layer=DEFAULT_TEM_LAYER_NAME, index_column="observation_id", format="long")
328328

329329

330+
def test_from_pandas_longitudinal_long_fill_time_gaps():
331+
df = pd.DataFrame(
332+
{
333+
"observation_id": ["p1", "p1", "p1", "p1", "p2", "p2", "p2", "p2"],
334+
"variable": ["v1", "v1", "v2", "v2", "v1", "v1", "v2", "v2"],
335+
"time": [0, 2, 0, 2, 0, 2, 0, 2],
336+
"value": [1.0, 3.0, 4.0, 6.0, 7.0, 9.0, 10.0, 12.0],
337+
}
338+
)
339+
edata = from_pandas(df, layer=DEFAULT_TEM_LAYER_NAME, format="long", fill_time_gaps=True)
340+
_assert_shape_matches(edata, (2, 2, 3), check_X_None=True)
341+
342+
assert np.array_equal(edata.tem.index.values, ["0", "1", "2"])
343+
344+
# time=1 should be NaN for all observations and variables
345+
assert np.all(np.isnan(edata.layers[DEFAULT_TEM_LAYER_NAME][:, :, 1]))
346+
347+
# time=0 and time=2 should have the original values
348+
np.testing.assert_array_equal(edata.layers[DEFAULT_TEM_LAYER_NAME][0, 0, :], [1.0, np.nan, 3.0])
349+
np.testing.assert_array_equal(edata.layers[DEFAULT_TEM_LAYER_NAME][0, 1, :], [4.0, np.nan, 6.0])
350+
351+
352+
def test_from_pandas_fill_time_gaps_wrong_format():
353+
df = pd.DataFrame({"a": [1]})
354+
with pytest.raises(ValueError, match="fill_time_gaps"):
355+
from_pandas(df, format="flat", fill_time_gaps=True)
356+
357+
330358
def test_to_pandas_longitudinal_wide(edata_333):
331359
df = to_pandas(edata_333, layer=DEFAULT_TEM_LAYER_NAME, format="wide")
332360
assert df.shape == (3, 9)

0 commit comments

Comments
 (0)