diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 9b36f5861..103b28559 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -36,7 +36,7 @@ Detailed list of changes - :func:`mne_bids.write_raw_bids()` can now handle mne `Raw` objects with `eyegaze` and `pupil` channels, by `Christian O'Reilly`_ (:gh:`1344`) - :func:`mne_bids.get_entity_vals()` has a new parameter ``ignore_suffixes`` to easily ignore sidecar files, by `Daniel McCloy`_ (:gh:`1362`) - Empty-room matching now preferentially finds recordings in the subject directory tagged as `task-noise` before looking in the `sub-emptyroom` directories. This adds support for a part of the BIDS specification for ER recordings, by `Berk Gerçek`_ (:gh:`1364`) -- Path matching is now implemenented in a more efficient manner within `_return_root_paths`, by `Arne Gottwald` (:gh:`1355`) +- Path matching is now implemenented in a more efficient manner within :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`) - :func:`mne_bids.get_entity_vals()` has a new parameter ``include_match`` to prefilter item matching and ignore non-matched items from begin of directory scan, by `Arne Gottwald` (:gh:`1355`) @@ -56,7 +56,7 @@ Detailed list of changes - :func:`mne_bids.read_raw_bids` can optionally return an ``event_id`` dictionary suitable for use with :func:`mne.events_from_annotations`, and if a ``values`` column is present in ``events.tsv`` it will be used as the source of the integer event ID codes, by `Daniel McCloy`_ (:gh:`1349`) - BIDS dictates that the recording entity should be displayed as "_recording-" in the filename. This PR makes :class:`mne_bids.BIDSPath` correctly display "_recording-" (instead of "_rec-") in BIDSPath.fpath. By `Scott Huberty`_ (:gh:`1348`) - :func:`mne_bids.make_dataset_description` now correctly encodes the dataset description as UTF-8 on disk, by `Scott Huberty`_ (:gh:`1357`) -- Corrects extension match within `_filter_fnames` at end of filename, by `Arne Gottwald` (:gh:`1355`) +- Corrects extension when filtering filenames in :meth:`mne_bids.BIDSPath.match()` and :func:`mne_bids.find_matching_paths()`, by `Arne Gottwald` (:gh:`1355`) ⚕️ Code health ^^^^^^^^^^^^^^ diff --git a/mne_bids/path.py b/mne_bids/path.py index 05ce32a5f..5d2f95bb0 100644 --- a/mne_bids/path.py +++ b/mne_bids/path.py @@ -12,7 +12,6 @@ from copy import deepcopy from datetime import datetime from io import StringIO -from itertools import chain as iter_chain from os import path as op from pathlib import Path from textwrap import indent @@ -2014,9 +2013,12 @@ def get_entity_vals( ignore_suffixes : str | array-like of str | None Suffixes to ignore. If ``None``, include all suffixes. This can be helpful for ignoring non-data sidecars such as `*_scans.tsv` or `*_coordsystem.json`. + + .. versionadded:: 0.17 include_match : str | array-like of str | None - Apply a starting match pragma following Unix style pattern syntax from - package glob to prefilter search criterion. + Glob-style pattern(s) of *directories* to include in the search (i.e., each + must end with ``"/"``). ``None`` (the default) is equivalent to ``"**/"`` + (search within any subdirectory of the BIDS root). .. versionadded:: 0.17 with_key : bool @@ -2117,8 +2119,7 @@ def get_entity_vals( search_str = f"**/*{entity_long_abbr_map[entity_key]}-*_*" if include_match is not None: include_match = _ensure_tuple(include_match) - filenames = [root.glob(im + search_str) for im in include_match] - filenames = iter_chain(*filenames) + filenames = [f for im in include_match for f in root.glob(im + search_str)] else: filenames = root.glob(search_str) @@ -2541,21 +2542,20 @@ def _return_root_paths(root, datatype=None, ignore_json=True, ignore_nosub=False root = Path(root) # if root is str if datatype is None and not ignore_nosub: - search_str = "*.*" - paths = root.rglob(search_str) + paths = root.rglob("*.*") else: if datatype is not None: datatype = _ensure_tuple(datatype) - search_str = f"**/{'|'.join(datatype)}/.*" + search_str = f"**/{'|'.join(datatype)}/*.*" else: search_str = "**/*.*" # only browse files which are of the form root/sub-*, # such that we truely only look in 'sub'-folders: - if ignore_nosub: - search_str = "sub-*/" + search_str - + search_str = f"sub-*/{search_str}" + # TODO: Why is this not equivalent to list(root.rglob(search_str)) ? + # Most of the speedup is from using glob.iglob here. paths = [ Path(root, fn) for fn in glob.iglob(search_str, root_dir=root, recursive=True) diff --git a/mne_bids/tests/test_path.py b/mne_bids/tests/test_path.py index 43fd2a37d..bc4cdec6b 100644 --- a/mne_bids/tests/test_path.py +++ b/mne_bids/tests/test_path.py @@ -12,7 +12,6 @@ from pathlib import Path import mne -import numpy as np import pytest from mne.datasets import testing from mne.io import anonymize_info @@ -172,8 +171,8 @@ def test_path_benchmark(tmp_path_factory): """Benchmark exploring bids tree.""" # This benchmark is to verify the speed-up in function call get_entity_vals with # `include_match=sub-*/` in face of a bids tree hosting derivatives and sourcedata. - n_subjects = 20 - n_sessions = 10 + n_subjects = 10 + n_sessions = 5 n_derivatives = 17 tmp_bids_root = tmp_path_factory.mktemp("mnebids_utils_test_bids_ds") @@ -221,50 +220,45 @@ def test_path_benchmark(tmp_path_factory): # apply nosub on find_matching_matchs with root level bids directory should # yield a performance boost of order of length from bids_subdirectories. - timed_all, timed_ignored_nosub = ( - timeit.timeit( - "mne_bids.find_matching_paths(tmp_bids_root)", - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=1, - ), - timeit.timeit( - "mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)", - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=10, - ) - / 10, + setup = "import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'" + timed_all = timeit.timeit( + "mne_bids.find_matching_paths(tmp_bids_root)", setup=setup, number=1 ) - - assert ( - timed_all / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10))) - # while this should be of same order, lets give it some space by a factor of 2 - > 0.5 * timed_ignored_nosub + timed_ignored_nosub = timeit.timeit( + "mne_bids.find_matching_paths(tmp_bids_root, ignore_nosub=True)", + setup=setup, + number=1, ) + # while this should be of same order, lets give it some space by a factor of 2 + target = 2 * timed_all / len(bids_subdirectories) + assert timed_ignored_nosub < target + # apply include_match on get_entity_vals with root level bids directory should # yield a performance boost of order of length from bids_subdirectories. - timed_entity, timed_entity_match = ( - timeit.timeit( - "mne_bids.get_entity_vals(tmp_bids_root, 'session')", - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=1, - ), - timeit.timeit( - "mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')", # noqa: E501 - setup="import mne_bids\ntmp_bids_root=r'" + str(tmp_bids_root) + "'", - number=10, - ) - / 10, + timed_entity = timeit.timeit( + "mne_bids.get_entity_vals(tmp_bids_root, 'session')", + setup=setup, + number=1, ) - - assert ( - timed_entity / (10 * np.maximum(1, np.floor(len(bids_subdirectories) / 10))) - # while this should be of same order, lets give it some space by a factor of 2 - > 0.5 * timed_entity_match + timed_entity_match = timeit.timeit( + "mne_bids.get_entity_vals(tmp_bids_root, 'session', include_match='sub-*/')", # noqa: E501 + setup=setup, + number=1, ) - # Clean up - shutil.rmtree(tmp_bids_root) + # while this should be of same order, lets give it some space by a factor of 2 + target = 2 * timed_entity / len(bids_subdirectories) + assert timed_entity_match < target + + # and these should be equivalent + out_1 = get_entity_vals(tmp_bids_root, "session") + out_2 = get_entity_vals(tmp_bids_root, "session", include_match="**/") + assert out_1 == out_2 + out_3 = get_entity_vals(tmp_bids_root, "session", include_match="sub-*/") + assert out_2 == out_3 # all are sub-* vals + out_4 = get_entity_vals(tmp_bids_root, "session", include_match="none/") + assert out_4 == [] def test_search_folder_for_text(capsys): @@ -1046,7 +1040,7 @@ def test_filter_fnames(entities, expected_n_matches): @testing.requires_testing_data -def test_match(return_bids_test_dir): +def test_match_basic(return_bids_test_dir): """Test retrieval of matching basenames.""" bids_root = Path(return_bids_test_dir) @@ -1140,6 +1134,27 @@ def test_match(return_bids_test_dir): bids_path_01.fpath.unlink() # clean up created file +def test_match_advanced(tmp_path): + """Test additional match functionality.""" + bids_root = tmp_path + fnames = ( + "sub-01/nirs/sub-01_task-tapping_events.tsv", + "sub-02/nirs/sub-02_task-tapping_events.tsv", + ) + for fname in fnames: + this_path = Path(bids_root / fname) + this_path.parent.mkdir(parents=True, exist_ok=True) + this_path.touch() + path = BIDSPath( + root=bids_root, + datatype="nirs", + suffix="events", + extension=".tsv", + ) + matches = path.match() + assert len(matches) == len(fnames), path + + @testing.requires_testing_data def test_find_matching_paths(return_bids_test_dir): """We test by yielding the same results as BIDSPath.match().