Feature: Fill time gaps in from_pandas(..., format="long") with new boolean argument fill_time_gaps (#229)

eroell · web-flow · commit 6f3e9aa7b0ea · 2026-03-24T20:37:41.000+01:00
* feature fill timegaps

* fill only value variable with nan

* start from 0 as mentioned in docstring

* changelog
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,8 +10,17 @@ and this project adheres to [Semantic Versioning][].
 
 ## [0.1.2]
 
+### Added
+ - {func}`~ehrdata.io.from_pandas` with `format='long'` provides a new keyword argument `fill_time_gaps` that fills gaps in the time axis with NaN, so that the time steps form a contiguous integer range from 0 to the maximum time value ([#229](https://github.com/theislab/ehrdata/pull/229)) @eroell
+
+### Modified
+ - {func}`~ehrdata.dt.mimic_2` column `censor_flg` now follows the lifelines convention (1=event, 0=censored). Previously, the dataset loader used the opposite encoding, as provided by the original dataset. ([#227](https://github.com/theislab/ehrdata/pull/227)) @sueoglu
+
 ### Fixed
- - {func}`~ehrdata.move_to_obs` with `format='long'` misordered entries in `.X`/`.layers` with `.obs` if the input df was not sorted for the obs id keys, which is now fixed. ([#228](https://github.com/theislab/ehrdata/pull/228)) @eroell
+ - {func}`~ehrdata.io.from_pandas` with `format='long'` misaligned entries in `.X`/`.layers` relative to `.obs` if the input df was not sorted by the observation id keys; this is now fixed. ([#228](https://github.com/theislab/ehrdata/pull/228)) @eroell
+
+### Documentation
+ - Documentation style polishing ([#223](https://github.com/theislab/ehrdata/pull/223)) @zethson
 
 ## [0.1.1]
 
diff --git a/src/ehrdata/io/pandas.py b/src/ehrdata/io/pandas.py
@@ -26,6 +26,7 @@ def from_pandas(
     wide_format_time_suffix: str | None = None,
     long_format_keys: dict[Literal["observation_column", "variable_column", "time_column", "value_column"], str]
     | None = None,
+    fill_time_gaps: bool = False,
 ) -> EHRData:
     """Transform a given :class:`~pandas.DataFrame` into an :class:`~ehrdata.EHRData` object.
 
@@ -53,6 +54,11 @@ def from_pandas(
             "variable_column": "<the column name of the variable ids>",
             "time_column": "<the column name of the time>",
             "value_column": "<the column name of the values>"}`.
+        fill_time_gaps: Use only if `format="long"`.
+            If `True`, reindexes the time axis (3rd dimension) to the contiguous integer range from 0 to the
+            maximum time value, filling missing time steps with NaN; expects non-negative integer time values.
+            For example, if the data contains time indices ``[0, 1, 2, 5]``, the resulting time axis will be
+            ``[0, 1, 2, 3, 4, 5]`` with NaN values at indices 3 and 4 for all observations and variables.
 
     Examples:
         >>> import ehrdata as ed
@@ -137,6 +143,10 @@ def from_pandas(
             err_msg = f"Invalid keys: {invalid_keys}. Please use only the following keys: {valid_long_format_keys}."
             raise ValueError(err_msg)
 
+    if fill_time_gaps and format != "long":
+        err_msg = "fill_time_gaps should only be used if format is 'long'."
+        raise ValueError(err_msg)
+
     if format != "wide" and wide_format_time_suffix is not None:
         err_msg = "wide_format_time_suffix should only be used if format is 'wide'."
         raise ValueError(err_msg)
@@ -257,6 +267,16 @@ def from_pandas(
             ]
         ).to_xarray()
 
+        if fill_time_gaps:
+            time_key = long_format_keys["time_column"]
+            current_times = xr_dataarray[time_key].values
+            full_range = np.arange(0, int(current_times.max()) + 1)
+            # Select only the value variable before reindexing to avoid dtype conflicts
+            # (e.g. datetime64 columns can't be filled with np.nan)
+            xr_dataarray = xr_dataarray[[long_format_keys["value_column"]]].reindex(
+                {time_key: full_range}, fill_value=np.nan
+            )
+
         tem_layer = xr_dataarray[long_format_keys["value_column"]].values
 
         # xarray sorts the coordinates, so obs must be reindexed to match
diff --git a/tests/io/test_pandas.py b/tests/io/test_pandas.py
@@ -327,6 +327,34 @@ def test_from_pandas_longitudinal_long_index_column_not_implemented():
         from_pandas(df, layer=DEFAULT_TEM_LAYER_NAME, index_column="observation_id", format="long")
 
 
+def test_from_pandas_longitudinal_long_fill_time_gaps():
+    df = pd.DataFrame(
+        {
+            "observation_id": ["p1", "p1", "p1", "p1", "p2", "p2", "p2", "p2"],
+            "variable": ["v1", "v1", "v2", "v2", "v1", "v1", "v2", "v2"],
+            "time": [0, 2, 0, 2, 0, 2, 0, 2],
+            "value": [1.0, 3.0, 4.0, 6.0, 7.0, 9.0, 10.0, 12.0],
+        }
+    )
+    edata = from_pandas(df, layer=DEFAULT_TEM_LAYER_NAME, format="long", fill_time_gaps=True)
+    _assert_shape_matches(edata, (2, 2, 3), check_X_None=True)
+
+    assert np.array_equal(edata.tem.index.values, ["0", "1", "2"])
+
+    # time=1 should be NaN for all observations and variables
+    assert np.all(np.isnan(edata.layers[DEFAULT_TEM_LAYER_NAME][:, :, 1]))
+
+    # time=0 and time=2 should have the original values
+    np.testing.assert_array_equal(edata.layers[DEFAULT_TEM_LAYER_NAME][0, 0, :], [1.0, np.nan, 3.0])
+    np.testing.assert_array_equal(edata.layers[DEFAULT_TEM_LAYER_NAME][0, 1, :], [4.0, np.nan, 6.0])
+
+
+def test_from_pandas_fill_time_gaps_wrong_format():
+    df = pd.DataFrame({"a": [1]})
+    with pytest.raises(ValueError, match="fill_time_gaps"):
+        from_pandas(df, format="flat", fill_time_gaps=True)
+
+
 def test_to_pandas_longitudinal_wide(edata_333):
     df = to_pandas(edata_333, layer=DEFAULT_TEM_LAYER_NAME, format="wide")
     assert df.shape == (3, 9)