From b8c21a533f3d20458e34df4032617b92d0aab9fb Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Feb 2025 12:58:58 -0500 Subject: [PATCH 1/4] BUG: Fix bug with path search --- mne_bids/path.py | 13 +++++-------- mne_bids/tests/test_path.py | 23 ++++++++++++++++++++++- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/mne_bids/path.py b/mne_bids/path.py index 05ce32a5f..dc9c5e7cd 100644 --- a/mne_bids/path.py +++ b/mne_bids/path.py @@ -2014,6 +2014,8 @@ def get_entity_vals( ignore_suffixes : str | array-like of str | None Suffixes to ignore. If ``None``, include all suffixes. This can be helpful for ignoring non-data sidecars such as `*_scans.tsv` or `*_coordsystem.json`. + + .. versionadded:: 0.17 include_match : str | array-like of str | None Apply a starting match pragma following Unix style pattern syntax from package glob to prefilter search criterion. @@ -2542,24 +2544,19 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False if datatype is None and not ignore_nosub: search_str = "*.*" - paths = root.rglob(search_str) else: if datatype is not None: datatype = _ensure_tuple(datatype) - search_str = f"**/{'|'.join(datatype)}/.*" + search_str = f"**/{'|'.join(datatype)}/*.*" else: search_str = "**/*.*" # only browse files which are of the form root/sub-*, # such that we truely only look in 'sub'-folders: - if ignore_nosub: - search_str = "sub-*/" + search_str + search_str = f"sub-*/{search_str}" - paths = [ - Path(root, fn) - for fn in glob.iglob(search_str, root_dir=root, recursive=True) - ] + paths = root.rglob(search_str) # Only keep files (not directories), ... # and omit the JSON sidecars if `ignore_json` is True. diff --git a/mne_bids/tests/test_path.py b/mne_bids/tests/test_path.py index 43fd2a37d..abe1996c1 100644 --- a/mne_bids/tests/test_path.py +++ b/mne_bids/tests/test_path.py @@ -1046,7 +1046,7 @@ def test_filter_fnames(entities, expected_n_matches): @testing.requires_testing_data -def test_match(return_bids_test_dir): +def test_match_basic(return_bids_test_dir): """Test retrieval of matching basenames.""" bids_root = Path(return_bids_test_dir) @@ -1140,6 +1140,27 @@ def test_match(return_bids_test_dir): bids_path_01.fpath.unlink() # clean up created file +def test_match_advanced(tmp_path): + """Test additional match functionality.""" + bids_root = tmp_path + fnames = ( + "sub-01/nirs/sub-01_task-tapping_events.tsv", + "sub-02/nirs/sub-02_task-tapping_events.tsv", + ) + for fname in fnames: + this_path = Path(bids_root / fname) + this_path.parent.mkdir(parents=True, exist_ok=True) + this_path.touch() + path = BIDSPath( + root=bids_root, + datatype="nirs", + suffix="events", + extension=".tsv", + ) + matches = path.match() + assert len(matches) == len(fnames), path + + @testing.requires_testing_data def test_find_matching_paths(return_bids_test_dir): """We test by yielding the same results as BIDSPath.match(). From 400bd7f687e75e7c1ca3ac20d5e95d1de905d2c1 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Feb 2025 13:21:52 -0500 Subject: [PATCH 2/4] FIX: Simplify --- mne_bids/path.py | 8 +++-- mne_bids/tests/test_path.py | 59 ++++++++++++++----------------------- 2 files changed, 28 insertions(+), 39 deletions(-) diff --git a/mne_bids/path.py b/mne_bids/path.py index dc9c5e7cd..aaf6c4070 100644 --- a/mne_bids/path.py +++ b/mne_bids/path.py @@ -2544,6 +2544,7 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False if datatype is None and not ignore_nosub: search_str = "*.*" + paths = root.rglob(search_str) else: if datatype is not None: datatype = _ensure_tuple(datatype) @@ -2555,8 +2556,11 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False # such that we truely only look in 'sub'-folders: if ignore_nosub: search_str = f"sub-*/{search_str}" - - paths = root.rglob(search_str) + # TODO: Why is this not equivalent to list(root.rglob(search_str)) ? + paths = [ + Path(root, fn) + for fn in glob.iglob(search_str, root_dir=root, recursive=True) + ] # Only keep files (not directories), ... # and omit the JSON sidecars if `ignore_json` is True. diff --git a/mne_bids/tests/test_path.py b/mne_bids/tests/test_path.py index abe1996c1..2935eebc3 100644 --- a/mne_bids/tests/test_path.py +++ b/mne_bids/tests/test_path.py @@ -12,7 +12,6 @@ from pathlib import Path import mne -import numpy as np import pytest from mne.datasets import testing from mne.io import anonymize_info @@ -221,50 +220,36 @@ def test_path_benchmark(tmp_path_factory): # apply nosub on find_matching_matchs with root level bids directory should # yield a performance boost of order of length from bids_subdirectories. - timed_all, timed_ignored_nosub = ( - timeit.timeit( - "mne_bids.find_matching_paths(tmp_bids_root)", - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=1, - ), - timeit.timeit( - "mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)", - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=10, - ) - / 10, + setup = "import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'" + timed_all = timeit.timeit( + "mne_bids.find_matching_paths(tmp_bids_root)", setup=setup, number=1 ) - - assert ( - timed_all / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10))) - # while this should be of same order, lets give it some space by a factor of 2 - > 0.5 * timed_ignored_nosub + timed_ignored_nosub = timeit.timeit( + "mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)", + setup=setup, + number=1, ) + # while this should be of same order, lets give it some space by a factor of 2 + target = 2 * timed_all / len(bids_subdirectories) + assert timed_ignored_nosub < target + # apply include_match on get_entity_vals with root level bids directory should # yield a performance boost of order of length from bids_subdirectories. - timed_entity, timed_entity_match = ( - timeit.timeit( - "mne_bids.get_entity_vals(tmp_bids_root, 'session')", - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=1, - ), - timeit.timeit( - "mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')", # noqa: E501 - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=10, - ) - / 10, + timed_entity = timeit.timeit( + "mne_bids.get_entity_vals(tmp_bids_root, 'session')", + setup=setup, + number=1, ) - - assert ( - timed_entity / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10))) - # while this should be of same order, lets give it some space by a factor of 2 - > 0.5 * timed_entity_match + timed_entity_match = timeit.timeit( + "mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')", # noqa: E501 + setup=setup, + number=1, ) - # Clean up - shutil.rmtree(tmp_bids_root) + # while this should be of same order, lets give it some space by a factor of 2 + target = 2 * timed_entity / len(bids_subdirectories) + assert timed_entity_match < target def test_search_folder_for_text(capsys): From 93d30b6c546a4d27ffaffc91ea286791b6d5552e Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 25 Feb 2025 13:50:08 -0500 Subject: [PATCH 3/4] FIX: More --- doc/whats_new.rst | 4 ++-- mne_bids/path.py | 13 ++++++------- mne_bids/tests/test_path.py | 13 +++++++++++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 9b36f5861..103b28559 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -36,7 +36,7 @@ Detailed list of changes - :func:`mne_bids.write_raw_bids()` can now handle mne `Raw` objects with `eyegaze` and `pupil` channels, by `Christian O'Reilly`_ (:gh:`1344`) - :func:`mne_bids.get_entity_vals()` has a new parameter ``ignore_suffixes`` to easily ignore sidecar files, by `Daniel McCloy`_ (:gh:`1362`) - Empty-room matching now preferentially finds recordings in the subject directory tagged as `task-noise` before looking in the `sub-emptyroom` directories. This adds support for a part of the BIDS specification for ER recordings, by `Berk Gerçek`_ (:gh:`1364`) -- Path matching is now implemenented in a more efficient manner within `_return_root_paths`, by `Arne Gottwald` (:gh:`1355`) +- Path matching is now implemenented in a more efficient manner within :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`) - :func:`mne_bids.get_entity_vals()` has a new parameter ``include_match`` to prefilter item matching and ignore non-matched items from begin of directory scan, by `Arne Gottwald` (:gh:`1355`) @@ -56,7 +56,7 @@ Detailed list of changes - :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``values`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`) - BIDS dictates that the recording entity should be displayed as "_recording-" in the filename. This PR makes :class:`mne_bids.BIDSPath` correctly display "_recording-" (instead of "_rec-") in BIDSPath.fpath. By `Scott Huberty`_ (:gh:`1348`) - :func:`mne_bids.make_dataset_description` now correctly encodes the dataset description as UTF-8 on disk, by `Scott Huberty`_ (:gh:`1357`) -- Corrects extension match within `_filter_fnames` at end of filename, by `Arne Gottwald` (:gh:`1355`) +- Corrects extension when filtering filenames in :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`) ⚕️ Code health ^^^^^^^^^^^^^^ diff --git a/mne_bids/path.py b/mne_bids/path.py index aaf6c4070..ad7cf67c9 100644 --- a/mne_bids/path.py +++ b/mne_bids/path.py @@ -12,7 +12,6 @@ from copy import deepcopy from datetime import datetime from io import StringIO -from itertools import chain as iter_chain from os import path as op from pathlib import Path from textwrap import indent @@ -2017,8 +2016,9 @@ def get_entity_vals( .. versionadded:: 0.17 include_match : str | array-like of str | None - Apply a starting match pragma following Unix style pattern syntax from - package glob to prefilter search criterion. + Glob-style match string(s) of directories to include in the search (i.e., each + must end with ``"/"``). ``None`` (the default) is equivalent to ``"**/"`` + (within any subdirectory of the BIDS root). .. versionadded:: 0.17 with_key : bool @@ -2119,8 +2119,7 @@ def get_entity_vals( search_str = f"**/*{entity_long_abbr_map[entity_key]}-*_*" if include_match is not None: include_match = _ensure_tuple(include_match) - filenames = [root.glob(im + search_str) for im in include_match] - filenames = iter_chain(*filenames) + filenames = [f for im in include_match for f in root.glob(im + search_str)] else: filenames = root.glob(search_str) @@ -2543,8 +2542,7 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False root = Path(root) # if root is str if datatype is None and not ignore_nosub: - search_str = "*.*" - paths = root.rglob(search_str) + paths = root.rglob("*.*") else: if datatype is not None: datatype = _ensure_tuple(datatype) @@ -2557,6 +2555,7 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False if ignore_nosub: search_str = f"sub-*/{search_str}" # TODO: Why is this not equivalent to list(root.rglob(search_str)) ? + # Most of the speedup is from using glob.iglob here. paths = [ Path(root, fn) for fn in glob.iglob(search_str, root_dir=root, recursive=True) diff --git a/mne_bids/tests/test_path.py b/mne_bids/tests/test_path.py index 2935eebc3..bc4cdec6b 100644 --- a/mne_bids/tests/test_path.py +++ b/mne_bids/tests/test_path.py @@ -171,8 +171,8 @@ def test_path_benchmark(tmp_path_factory): """Benchmark exploring bids tree.""" # This benchmark is to verify the speed-up in function call get_entity_vals with # `include_match=sub-*/` in face of a bids tree hosting derivatives and sourcedata. - n_subjects = 20 - n_sessions = 10 + n_subjects = 10 + n_sessions = 5 n_derivatives = 17 tmp_bids_root = tmp_path_factory.mktemp("mnebids_utils_test_bids_ds") @@ -251,6 +251,15 @@ def test_path_benchmark(tmp_path_factory): target = 2 * timed_entity / len(bids_subdirectories) assert timed_entity_match < target + # and these should be equivalent + out_1 = get_entity_vals(tmp_bids_root, "session") + out_2 = get_entity_vals(tmp_bids_root, "session", include_match="**/") + assert out_1 == out_2 + out_3 = get_entity_vals(tmp_bids_root, "session", include_match="sub-*/") + assert out_2 == out_3 # all are sub-* vals + out_4 = get_entity_vals(tmp_bids_root, "session", include_match="none/") + assert out_4 == [] + def test_search_folder_for_text(capsys): """Test finding entries.""" From d78d636b939becd57f6008952f90a210f089d4c3 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Tue, 25 Feb 2025 13:55:26 -0600 Subject: [PATCH 4/4] tweak docstring [ci skip] --- mne_bids/path.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mne_bids/path.py b/mne_bids/path.py index ad7cf67c9..5d2f95bb0 100644 --- a/mne_bids/path.py +++ b/mne_bids/path.py @@ -2016,9 +2016,9 @@ def get_entity_vals( .. versionadded:: 0.17 include_match : str | array-like of str | None - Glob-style match string(s) of directories to include in the search (i.e., each + Glob-style pattern(s) of *directories* to include in the search (i.e., each must end with ``"/"``). ``None`` (the default) is equivalent to ``"**/"`` - (within any subdirectory of the BIDS root). + (search within any subdirectory of the BIDS root). .. versionadded:: 0.17 with_key : bool