Revert "Rework mne bids path match (#1355)"

drammock · web-flow · commit 517707abf24f · 2025-02-24T11:28:56.000-06:00
This reverts commit 7b04694.
diff --git a/doc/authors.rst b/doc/authors.rst
@@ -5,12 +5,10 @@
 .. _Amaia Benitez: https://github.com/AmaiaBA
 .. _Anand Saini: https://github.com/anandsaini024
 .. _Ariel Rokem: https://github.com/arokem
-.. _Arne Gottwald: https://github.com/waldie11
 .. _Austin Hurst: https://github.com/a-hurst
 .. _Berk Gerçek: https://github.com/berkgercek
 .. _Bruno Hebling Vieira: https://github.com/bhvieira
 .. _Chris Holdgraf: https://bids.berkeley.edu/people/chris-holdgraf
-.. _Christian O'Reilly: https://github.com/christian-oreilly
 .. _Clemens Brunner: https://github.com/cbrnr
 .. _Daniel McCloy: http://dan.mccloy.info
 .. _Denis Engemann: https://github.com/dengemann
@@ -54,3 +52,4 @@
 .. _Tom Donoghue: https://github.com/TomDonoghue
 .. _William Turner: https://bootstrapbill.github.io/
 .. _Yorguin Mantilla: https://github.com/yjmantilla
+.. _Christian O'Reilly: https://github.com/christian-oreilly
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -19,7 +19,6 @@ The following authors contributed for the first time. Thank you so much! 🤩
 
 * `Christian O'Reilly`_
 * `Berk Gerçek`_
-* `Arne Gottwald`_
 
 The following authors had contributed before. Thank you for sticking around! 🤘
 
@@ -36,8 +35,6 @@ Detailed list of changes
 - :func:`mne_bids.write_raw_bids()` can now handle mne `Raw` objects with `eyegaze` and `pupil` channels, by `Christian O'Reilly`_ (:gh:`1344`)
 - :func:`mne_bids.get_entity_vals()` has a new parameter ``ignore_suffixes`` to easily ignore sidecar files, by `Daniel McCloy`_ (:gh:`1362`)
 - Empty-room matching now preferentially finds recordings in the subject directory tagged as `task-noise` before looking in the `sub-emptyroom` directories. This adds support for a part of the BIDS specification for ER recordings, by `Berk Gerçek`_ (:gh:`1364`)
-- Path matching is now implemenented in a more efficient manner within `_return_root_paths`, by `Arne Gottwald` (:gh:`1355`)
-- :func:`mne_bids.get_entity_vals()` has a new parameter ``include_match`` to prefilter item matching and ignore non-matched items from begin of directory scan, by `Arne Gottwald` (:gh:`1355`)
 
 
 🧐 API and behavior changes
@@ -56,7 +53,6 @@ Detailed list of changes
 - :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``values`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`)
 - BIDS dictates that the recording entity should be displayed as "_recording-" in the filename. This PR makes :class:`mne_bids.BIDSPath`  correctly display "_recording-" (instead of "_rec-") in BIDSPath.fpath. By `Scott Huberty`_ (:gh:`1348`)
 - :func:`mne_bids.make_dataset_description` now correctly encodes the dataset description as UTF-8 on disk, by `Scott Huberty`_ (:gh:`1357`)
-- Corrects extension match within `_filter_fnames` at end of filename, by `Arne Gottwald` (:gh:`1355`)
 
 ⚕️ Code health
 ^^^^^^^^^^^^^^
diff --git a/mne_bids/path.py b/mne_bids/path.py
@@ -12,7 +12,6 @@
 from copy import deepcopy
 from datetime import datetime
 from io import StringIO
-from itertools import chain as iter_chain
 from os import path as op
 from pathlib import Path
 from textwrap import indent
@@ -1463,11 +1462,9 @@ def _truncate_tsv_line(line, lim=10):
     """Truncate a line to the specified number of characters."""
     return "".join(
         [
-            (
-                str(val) + (lim - len(val)) * " "
-                if len(val) < lim
-                else f"{val[: lim - 1]} "
-            )
+            str(val) + (lim - len(val)) * " "
+            if len(val) < lim
+            else f"{val[: lim - 1]} "
             for val in line.split("\t")
         ]
     )
@@ -1949,7 +1946,6 @@ def get_entity_vals(
     ignore_datatypes=None,
     ignore_dirs=("derivatives", "sourcedata"),
     ignore_suffixes=None,
-    include_match=None,
     with_key=False,
     verbose=None,
 ):
@@ -2014,9 +2010,6 @@ def get_entity_vals(
     ignore_suffixes : str | array-like of str | None
         Suffixes to ignore. If ``None``, include all suffixes. This can be helpful for
         ignoring non-data sidecars such as `*_scans.tsv` or `*_coordsystem.json`.
-    include_match : str | array-like of str | None
-        Apply a starting match pragma following Unix style pattern syntax from
-        package glob to prefilter search criterion.
 
         .. versionadded:: 0.17
     with_key : bool
@@ -2114,13 +2107,7 @@ def get_entity_vals(
 
     p = re.compile(rf"{entity_long_abbr_map[entity_key]}-(.*?)_")
     values = list()
-    search_str = f"**/*{entity_long_abbr_map[entity_key]}-*_*"
-    if include_match is not None:
-        include_match = _ensure_tuple(include_match)
-        filenames = [root.glob(im + search_str) for im in include_match]
-        filenames = iter_chain(*filenames)
-    else:
-        filenames = root.glob(search_str)
+    filenames = root.glob(f"**/*{entity_long_abbr_map[entity_key]}-*_*")
 
     for filename in filenames:
         # Skip ignored directories
@@ -2371,7 +2358,7 @@ def _filter_fnames(
         r"_desc-(" + "|".join(description) + ")" if description else r"(|_desc-([^_]+))"
     )
     suffix_str = r"_(" + "|".join(suffix) + ")" if suffix else r"_([^_]+)"
-    ext_str = r"(" + "|".join(extension) + ")$" if extension else r".([^_]+)"
+    ext_str = r"(" + "|".join(extension) + ")" if extension else r".([^_]+)"
 
     regexp = (
         leading_path_str
@@ -2515,7 +2502,7 @@ def find_matching_paths(
 
 
 def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False):
-    """Return all file paths + .ds paths in root.
+    """Return all file paths in root.
 
     Can be filtered by datatype (which is present in the path but not in
     the BIDSPath basename). Can also be list of datatypes.
@@ -2536,50 +2523,29 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False
     Returns
     -------
     paths : list of pathlib.Path
-        All files + .ds paths in `root`, filtered according to the function parameters.
+        All paths in `root`, filtered according to the function parameters.
     """
     root = Path(root)  # if root is str
 
-    if datatype is None and not ignore_nosub:
-        search_str = "*.*"
-        paths = root.rglob(search_str)
+    if datatype is not None:
+        datatype = _ensure_tuple(datatype)
+        search_str = f"*/{'|'.join(datatype)}/*"
     else:
-        if datatype is not None:
-            datatype = _ensure_tuple(datatype)
-            search_str = f"**/{'|'.join(datatype)}/.*"
-        else:
-            search_str = "**/*.*"
-
-        # only browse files which are of the form root/sub-*,
-        # such that we truely only look in 'sub'-folders:
-
-        if ignore_nosub:
-            search_str = "sub-*/" + search_str
-
-        paths = [
-            Path(root, fn)
-            for fn in glob.iglob(search_str, root_dir=root, recursive=True)
-        ]
+        search_str = "*.*"
 
+    paths = root.rglob(search_str)
     # Only keep files (not directories), ...
     # and omit the JSON sidecars if `ignore_json` is True.
     if ignore_json:
-        paths = [
-            p
-            for p in paths
-            if (p.is_file() and p.suffix != ".json")
-            # XXX: generalize with a private func that takes
-            # a config of which "data format" are to be expected like .ds
-            or (p.is_dir() and p.suffix == ".ds")
-        ]
+        paths = [p for p in paths if p.is_file() and p.suffix != ".json"]
     else:
-        paths = [
-            p
-            for p in paths
-            if p.is_file()
-            # XXX: see above, generalize with private func
-            or (p.is_dir() and p.suffix == ".ds")
-        ]
+        paths = [p for p in paths if p.is_file()]
+
+    # only keep files which are of the form root/sub-*,
+    # such that we only look in 'sub'-folders:
+    if ignore_nosub:
+        root_sub = str(root / "sub-")
+        paths = [p for p in paths if str(p).startswith(root_sub)]
 
     return paths
 
diff --git a/mne_bids/tests/test_path.py b/mne_bids/tests/test_path.py
@@ -7,12 +7,10 @@
 import os.path as op
 import shutil
 import shutil as sh
-import timeit
 from datetime import datetime, timezone
 from pathlib import Path
 
 import mne
-import numpy as np
 import pytest
 from mne.datasets import testing
 from mne.io import anonymize_info
@@ -168,105 +166,6 @@ def test_get_entity_vals(entity, expected_vals, kwargs, return_bids_test_dir):
     shutil.rmtree(deriv_path)
 
 
-def test_path_benchmark(tmp_path_factory):
-    """Benchmark exploring bids tree."""
-    # This benchmark is to verify the speed-up in function call get_entity_vals with
-    # `include_match=sub-*/` in face of a bids tree hosting derivatives and sourcedata.
-    n_subjects = 20
-    n_sessions = 10
-    n_derivatives = 17
-    tmp_bids_root = tmp_path_factory.mktemp("mnebids_utils_test_bids_ds")
-
-    derivatives = [
-        Path("derivatives", "derivatives" + str(i)) for i in range(n_derivatives)
-    ]
-
-    bids_subdirectories = ["", "sourcedata", *derivatives]
-
-    # Create a BIDS compliant directory tree with high number of branches
-    for i in range(1, n_subjects):
-        for j in range(1, n_sessions):
-            for subdir in bids_subdirectories:
-                for datatype in ["eeg", "meg"]:
-                    bids_subdir = BIDSPath(
-                        subject=str(i),
-                        session=str(j),
-                        datatype=datatype,
-                        task="audvis",
-                        root=str(tmp_bids_root / subdir),
-                    )
-                    bids_subdir.mkdir(exist_ok=True)
-                    Path(bids_subdir.root / "participants.tsv").touch()
-                    Path(bids_subdir.root / "participants.csv").touch()
-                    Path(bids_subdir.root / "README").touch()
-
-                    # os.makedirs(bids_subdir.directory, exist_ok=True)
-                    Path(
-                        bids_subdir.directory, bids_subdir.basename + "_events.tsv"
-                    ).touch()
-                    Path(
-                        bids_subdir.directory, bids_subdir.basename + "_events.csv"
-                    ).touch()
-
-                    if datatype == "meg":
-                        ctf_path = Path(
-                            bids_subdir.directory, bids_subdir.basename + "_meg.ds"
-                        )
-                        ctf_path.mkdir(exist_ok=True)
-                        Path(ctf_path, bids_subdir.basename + ".meg4").touch()
-                        Path(ctf_path, bids_subdir.basename + ".hc").touch()
-                        Path(ctf_path / "hz.ds").mkdir(exist_ok=True)
-                        Path(ctf_path / "hz.ds" / "hz.meg4").touch()
-                        Path(ctf_path / "hz.ds" / "hz.hc").touch()
-
-    # apply nosub on find_matching_matchs with root level bids directory should
-    # yield a performance boost of order of length from bids_subdirectories.
-    timed_all, timed_ignored_nosub = (
-        timeit.timeit(
-            "mne_bids.find_matching_paths(tmp_bids_root)",
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=1,
-        ),
-        timeit.timeit(
-            "mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)",
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=10,
-        )
-        / 10,
-    )
-
-    assert (
-        timed_all / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
-        # while this should be of same order, lets give it some space by a factor of 2
-        > 0.5 * timed_ignored_nosub
-    )
-
-    # apply include_match on get_entity_vals with root level bids directory should
-    # yield a performance boost of order of length from bids_subdirectories.
-    timed_entity, timed_entity_match = (
-        timeit.timeit(
-            "mne_bids.get_entity_vals(tmp_bids_root, 'session')",
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=1,
-        ),
-        timeit.timeit(
-            "mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')",  # noqa: E501
-            setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'",
-            number=10,
-        )
-        / 10,
-    )
-
-    assert (
-        timed_entity / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10)))
-        # while this should be of same order, lets give it some space by a factor of 2
-        > 0.5 * timed_entity_match
-    )
-
-    # Clean up
-    shutil.rmtree(tmp_bids_root)
-
-
 def test_search_folder_for_text(capsys):
     """Test finding entries."""
     with pytest.raises(ValueError, match="is not a directory"):