optionally return event_id dict from read_raw_bids, and use value column from events file if present (mne-tools#1349)

drammock · web-flow · commit 46f284bf899e · 2024-12-10T09:41:42.000+01:00
* optionally return event_id from read_raw_bids

* drop n/a values when creating event dict

* drop NA-onset events early

* comments & cleanup

* simplify

* bug

* clean up / strengthen test

* revert introduced bug

* changelog

* docstring
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -46,7 +46,7 @@ Detailed list of changes
 🪲 Bug fixes
 ^^^^^^^^^^^^
 
-- Nothing yet
+- :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``value`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`)
 
 ⚕️ Code health
 ^^^^^^^^^^^^^^
diff --git a/mne_bids/read.py b/mne_bids/read.py
@@ -527,89 +527,76 @@ def _handle_info_reading(sidecar_fname, raw):
 
 
 def _handle_events_reading(events_fname, raw):
-    """Read associated events.tsv and populate raw.
-
-    Handle onset, duration, and description of each event.
-    """
+    """Read associated events.tsv and convert valid events to annotations on Raw."""
     logger.info(f"Reading events from {events_fname}.")
     events_dict = _from_tsv(events_fname)
 
-    # Get the descriptions of the events
+    # drop events where onset is n/a
+    events_dict = _drop(events_dict, "n/a", "onset")
+
+    # Get event descriptions. Use `trial_type` column if available.
     if "trial_type" in events_dict:
         trial_type_col_name = "trial_type"
-    elif "stim_type" in events_dict:  # Backward-compat with old datasets.
+    # allow `stim_type` for backward-compat with old datasets.
+    elif "stim_type" in events_dict:
         trial_type_col_name = "stim_type"
         warn(
-            f'The events file, {events_fname}, contains a "stim_type" '
-            f'column. This column should be renamed to "trial_type" for '
-            f"BIDS compatibility."
+            f'The events file, {events_fname}, contains a "stim_type" column. This '
+            'column should be renamed to "trial_type" for BIDS compatibility.'
         )
+    # If we lack proper event descriptions, perhaps we have at least an event value?
+    elif "value" in events_dict:
+        trial_type_col_name = "value"
+    # Worst case: all events will become `n/a` and all values will be `1`
     else:
         trial_type_col_name = None
 
     if trial_type_col_name is not None:
         # Drop events unrelated to a trial type
         events_dict = _drop(events_dict, "n/a", trial_type_col_name)
-
+        trial_types = events_dict[trial_type_col_name]
+        # handle event values (if provided); ensure pairings are 1 value per description
         if "value" in events_dict:
-            # Check whether the `trial_type` <> `value` mapping is unique.
-            trial_types = events_dict[trial_type_col_name]
             values = np.asarray(events_dict["value"], dtype=str)
             for trial_type in np.unique(trial_types):
                 idx = np.where(trial_type == np.atleast_1d(trial_types))[0]
                 matching_values = values[idx]
-
                 if len(np.unique(matching_values)) > 1:
-                    # Event type descriptors are ambiguous; create hierarchical
-                    # event descriptors.
+                    # Event type descriptors are ambiguous; create hierarchical event
+                    # descriptors (to ensure trial_type -> integerID is 1:1)
                     logger.info(
-                        f'The event "{trial_type}" refers to multiple event '
-                        f"values. Creating hierarchical event names."
+                        f'The event "{trial_type}" refers to multiple event values.'
+                        "Creating hierarchical event names."
                     )
                     for ii in idx:
                         value = values[ii]
                         value = "na" if value == "n/a" else value
                         new_name = f"{trial_type}/{value}"
-                        logger.info(
-                            f"    Renaming event: {trial_type} -> " f"{new_name}"
-                        )
+                        logger.info(f"    Renaming event: {trial_type} -> {new_name}")
                         trial_types[ii] = new_name
-            descriptions = np.asarray(trial_types, dtype=str)
+            # drop rows where `value` is `n/a` & convert remaining `value` to int (only
+            # when making our `event_id` dict; `value = n/a` doesn't prevent annotation)
+            culled = _drop(events_dict, "n/a", "value")
+            event_id = dict(
+                zip(culled[trial_type_col_name], np.asarray(culled["value"], dtype=int))
+            )
         else:
-            descriptions = np.asarray(events_dict[trial_type_col_name], dtype=str)
-    elif "value" in events_dict:
-        # If we don't have a proper description of the events, perhaps we have
-        # at least an event value?
-        # Drop events unrelated to value
-        events_dict = _drop(events_dict, "n/a", "value")
-        descriptions = np.asarray(events_dict["value"], dtype=str)
+            event_id = dict(zip(trial_types, np.arange(len(trial_types))))
+        descrs = np.asarray(trial_types, dtype=str)
 
-    # Worst case, we go with 'n/a' for all events
+    # Worst case: all events become `n/a` and all values become `1`
     else:
-        descriptions = np.array(["n/a"] * len(events_dict["onset"]), dtype=str)
-
+        descrs = np.full(len(events_dict["onset"]), "n/a")
+        event_id = {descrs[0]: 1}
     # Deal with "n/a" strings before converting to float
-    onsets = np.array(
-        [np.nan if on == "n/a" else on for on in events_dict["onset"]], dtype=float
-    )
-    durations = np.array(
+    ons = np.asarray(events_dict["onset"], dtype=float)
+    durs = np.array(
         [0 if du == "n/a" else du for du in events_dict["duration"]], dtype=float
     )
 
-    # Keep only events where onset is known
-    good_events_idx = ~np.isnan(onsets)
-    onsets = onsets[good_events_idx]
-    durations = durations[good_events_idx]
-    descriptions = descriptions[good_events_idx]
-    del good_events_idx
-
-    # Add events as Annotations, but keep essential Annotations present in
-    # raw file
+    # Add events as Annotations, but keep essential Annotations present in raw file
     annot_from_raw = raw.annotations.copy()
-
-    annot_from_events = mne.Annotations(
-        onset=onsets, duration=durations, description=descriptions
-    )
+    annot_from_events = mne.Annotations(onset=ons, duration=durs, description=descrs)
     raw.set_annotations(annot_from_events)
 
     annot_idx_to_keep = [
@@ -622,7 +609,7 @@ def _handle_events_reading(events_fname, raw):
     if len(annot_to_keep):
         raw.set_annotations(raw.annotations + annot_to_keep)
 
-    return raw
+    return raw, event_id
 
 
 def _get_bads_from_tsv_data(tsv_data):
@@ -756,7 +743,9 @@ def _handle_channels_reading(channels_fname, raw):
 
 
 @verbose
-def read_raw_bids(bids_path, extra_params=None, verbose=None):
+def read_raw_bids(
+    bids_path, extra_params=None, *, return_event_dict=False, verbose=None
+):
     """Read BIDS compatible data.
 
     Will attempt to read associated events.tsv and channels.tsv files to
@@ -781,12 +770,21 @@ def read_raw_bids(bids_path, extra_params=None, verbose=None):
         Note that the ``exclude`` parameter, which is supported by some
         MNE-Python readers, is not supported; instead, you need to subset
         your channels **after** reading.
+    return_event_dict : bool
+        Whether to return a dictionary that maps annotation descriptions to integer
+        event IDs, in addition to the :class:`~mne.io.Raw` object. If a ``value`` column
+        is present in the ``*_events.tsv`` file, it will be used as the source of the
+        integer event ID values (events with ``value="n/a"`` will be omitted).
     %(verbose)s
 
     Returns
     -------
     raw : mne.io.Raw
         The data as MNE-Python Raw object.
+    event_id : dict
+        A mapping from event descriptions to integer event IDs, suitable for,
+        e.g., passing to :func:`mne.events_from_annotations`. Only returned if
+        ``return_event_dict=True``.
 
     Raises
     ------
@@ -923,9 +921,8 @@ def read_raw_bids(bids_path, extra_params=None, verbose=None):
     events_fname = _find_matching_sidecar(
         bids_path, suffix="events", extension=".tsv", on_error=on_error
     )
-
     if events_fname is not None:
-        raw = _handle_events_reading(events_fname, raw)
+        raw, event_id = _handle_events_reading(events_fname, raw)
 
     # Try to find an associated channels.tsv to get information about the
     # status and type of present channels
@@ -989,6 +986,8 @@ def read_raw_bids(bids_path, extra_params=None, verbose=None):
         raw.info["subject_info"] = dict()
 
     assert raw.annotations.orig_time == raw.info["meas_date"]
+    if return_event_dict:
+        return raw, event_id
     return raw
 
 
diff --git a/mne_bids/tests/test_read.py b/mne_bids/tests/test_read.py
@@ -509,8 +509,11 @@ def test_handle_events_reading(tmp_path):
     events_fname.parent.mkdir()
     _to_tsv(events, events_fname)
 
-    raw = _handle_events_reading(events_fname, raw)
-    events, event_id = mne.events_from_annotations(raw)
+    raw, event_id = _handle_events_reading(events_fname, raw)
+    ev_arr, ev_dict = mne.events_from_annotations(raw)
+    assert list(ev_dict.values()) == [1, 2]  # auto-assigned
+    want = len(events["onset"]) - 1  # one onset was n/a
+    assert want == len(raw.annotations) == len(ev_arr) == len(ev_dict)
 
     # Test with a `stim_type` column instead of `trial_type`.
     events = {
@@ -523,9 +526,24 @@ def test_handle_events_reading(tmp_path):
     _to_tsv(events, events_fname)
 
     with pytest.warns(RuntimeWarning, match="This column should be renamed"):
-        raw = _handle_events_reading(events_fname, raw)
+        raw, _ = _handle_events_reading(events_fname, raw)
     events, event_id = mne.events_from_annotations(raw)
 
+    # Test with only a `value` column.
+    events = {
+        "onset": [11, 12, 13, 14, 15],
+        "duration": ["n/a", "n/a", 0.1, 0.1, "n/a"],
+        "value": [3, 1, 1, 3, "n/a"],
+    }
+    events_fname = tmp_path / "bids3" / "sub-01_task-test_events.json"
+    events_fname.parent.mkdir()
+    _to_tsv(events, events_fname)
+
+    raw, event_id = _handle_events_reading(events_fname, raw)
+    ev_arr, ev_dict = mne.events_from_annotations(raw, event_id=event_id)
+    assert len(ev_arr) == len(events["value"]) - 1  # one value was n/a
+    assert {"1": 1, "3": 3} == event_id == ev_dict
+
     # Test with same `trial_type` referring to different `value`:
     # The events should be renamed automatically
     events = {
@@ -534,32 +552,32 @@ def test_handle_events_reading(tmp_path):
         "trial_type": ["event1", "event1", "event2", "event3", "event3"],
         "value": [1, 2, 3, 4, "n/a"],
     }
-    events_fname = tmp_path / "bids3" / "sub-01_task-test_events.json"
+    events_fname = tmp_path / "bids4" / "sub-01_task-test_events.json"
     events_fname.parent.mkdir()
     _to_tsv(events, events_fname)
 
-    raw = _handle_events_reading(events_fname, raw)
-    events, event_id = mne.events_from_annotations(raw)
-
-    assert len(events) == 5
-    assert "event1/1" in event_id
-    assert "event1/2" in event_id
-    assert "event3/4" in event_id
-    assert "event3/na" in event_id  # 'n/a' value should become 'na'
-    # The event with unique value mapping should not be renamed
-    assert "event2" in event_id
+    raw, event_id = _handle_events_reading(events_fname, raw)
+    ev_arr, ev_dict = mne.events_from_annotations(raw)
+    # `event_id` will exclude the last event, as its value is `n/a`, but `ev_dict` won't
+    # exclude it (it's made from annotations, which don't know about missing `value`s)
+    assert len(event_id) == len(ev_dict) - 1
+    # check the renaming
+    assert len(ev_arr) == 5
+    assert "event1/1" in ev_dict
+    assert "event1/2" in ev_dict
+    assert "event3/4" in ev_dict
+    assert "event3/na" in ev_dict  # 'n/a' value should become 'na'
+    assert "event2" in ev_dict  # has unique value mapping; should not be renamed
 
     # Test without any kind of event description.
     events = {"onset": [11, 12, "n/a"], "duration": ["n/a", "n/a", "n/a"]}
-    events_fname = tmp_path / "bids4" / "sub-01_task-test_events.json"
+    events_fname = tmp_path / "bids5" / "sub-01_task-test_events.json"
     events_fname.parent.mkdir()
     _to_tsv(events, events_fname)
 
-    raw = _handle_events_reading(events_fname, raw)
-    events, event_id = mne.events_from_annotations(raw)
-    ids = list(event_id.keys())
-    assert len(ids) == 1
-    assert ids == ["n/a"]
+    raw, event_id = _handle_events_reading(events_fname, raw)
+    ev_arr, ev_dict = mne.events_from_annotations(raw)
+    assert event_id == ev_dict == {"n/a": 1}  # fallback behavior
 
 
 @pytest.mark.filterwarnings(warning_str["channel_unit_changed"])