99import importlib .metadata
1010import re
1111import sys
12+ import json
1213from concurrent .futures import Executor , ProcessPoolExecutor
1314from functools import lru_cache , partial
1415from glob import glob
2526from ._logging import setup_logger
2627from ._pathlib import CloudPath , PathT , as_path , cloudpathlib_is_available
2728
28- _BIDS_SUBJECT_DIR_PATTERN = re .compile (r"sub-[a-zA-Z0-9]+" )
29+ from bidsschematools .schema import load_schema
30+
31+ SCHEMA = load_schema ()
2932
3033# Path names of BIDS dataset sub-directories that may contain nested BIDS datasets.
3134# Other candidates to consider including:
3235# - sourcedata
3336# - code
37+ # TODO: Remove this and replace where it is invoked with reference from bidsschematools
3438_BIDS_NESTED_PARENT_DIRNAMES = {
3539 "derivatives" ,
3640}
4044# matching non-json files at the same level. But that is a lot of work to do for a few
4145# special cases. Rather, we just list the special case suffixes here. (Honestly, using
4246# plain json extension for data files should be discouraged.)
47+ # TODO: Remove this and replace where it is invoked with reference from bidsschematools
4348_BIDS_JSON_SIDECAR_EXCEPTION_SUFFIXES = {
4449 "coordsystem" ,
4550}
@@ -185,6 +190,7 @@ def find_bids_datasets(
185190 descend = descend and not (entry .suffix or entry .name .startswith ("." ))
186191 # Only descend into specific subdirectories of BIDS directories.
187192 descend = descend and (
193+ # TODO: Remove this and replace where it is invoked with reference from bidsschematools
188194 not inside_bids or entry .name in _BIDS_NESTED_PARENT_DIRNAMES
189195 )
190196 # Finally, check if actually a directory (which is slow so we want to
@@ -323,16 +329,37 @@ def _is_bids_dataset(path: PathT) -> bool:
323329 if _is_bids_subject_dir (path ):
324330 return False
325331
326- # Check if contains a dataset_description.json or any subject directories. Note,
327- # it's common for ppl to forget the dataset description, so let's not be too strict.
332+ # Check if contains a dataset_description.json or is a derivatives directory
328333 description_exists = (path / "dataset_description.json" ).exists ()
329- return description_exists or _contains_bids_subject_dirs (path )
334+
335+ if description_exists :
336+ try :
337+ with open (path / "dataset_description.json" ) as f :
338+ desc = json .load (f )
339+ dataset_type = desc .get ("DatasetType" , "raw" )
340+ if dataset_type == "raw" :
341+ return True
342+ elif dataset_type == "derivative" :
343+ return _contains_bids_subject_dirs (path ) or any (
344+ p .is_dir ()
345+ for p in path .iterdir ()
346+ # TODO: Pull these valid paths from bidsschematools
347+ if p .name in {"derivatives" , "code" , "logs" }
348+ )
349+ except (json .JSONDecodeError , OSError ):
350+ pass
351+
352+ return False
330353
331354
def _contains_bids_subject_dirs(root: PathT) -> bool:
    """Return True if ``root`` is a directory with at least one BIDS subject dir.

    Only the immediate children of ``root`` are examined.
    """
    if root.is_dir():
        # Stop scanning at the first entry that looks like a subject directory.
        for entry in root.iterdir():
            if _is_bids_subject_dir(entry):
                return True
    return False
336363
337364
338365def _find_bids_subject_dirs (
@@ -344,7 +371,11 @@ def _find_bids_subject_dirs(
344371 Note, only looks one level down. Does not find nested subject directories, e.g. in
345372 derivatives datasets.
346373 """
347- paths = [path for path in root .glob ("sub-*" ) if _is_bids_subject_dir (path )]
374+ paths = [
375+ path
376+ for path in root .iterdir ()
377+ if _is_bids_subject_dir (path )
378+ ]
348379
349380 if include_subjects :
350381 filtered_names = _filter_include (
@@ -356,11 +387,9 @@ def _find_bids_subject_dirs(
356387
@lru_cache(maxsize=1)
def _bids_subject_dir_regex() -> "re.Pattern[str]":
    """Build (once per process) the regex used to recognise subject directory names."""
    subject_entity = SCHEMA["objects"]["entities"]["subject"]
    # NOTE(review): confirm that the schema's subject entity actually exposes a
    # "pattern" key holding the full ``sub-<label>`` regex. If it does not, the
    # fallback below is what always gets used.
    return re.compile(subject_entity.get("pattern", r"sub-[a-zA-Z0-9]+"))


def _is_bids_subject_dir(path: PathT) -> bool:
    """Check if a path is a BIDS subject directory.

    Only the final path component (``path.name``) is tested, and it must match
    the subject pattern in full. The path is not checked to be a directory.

    Args:
        path: Candidate path.

    Returns:
        True if ``path.name`` fully matches the subject directory pattern.
    """
    # This predicate runs for every directory entry scanned, so the schema lookup
    # and regex compilation are done once in the cached helper, not per call.
    return _bids_subject_dir_regex().fullmatch(path.name) is not None
364393
365394
366395def _index_bids_subject_dir (
@@ -455,6 +484,7 @@ def _is_bids_json_sidecar(path: PathT) -> bool:
455484 # All sidecars must contain a suffix.
456485 # Also check if suffix matches special cases of data files with json extension.
457486 suffix = entities .get ("suffix" )
487+ # TODO: Remove this and replace where it is invoked with reference from bidsschematools
458488 if suffix is None or suffix in _BIDS_JSON_SIDECAR_EXCEPTION_SUFFIXES :
459489 return False
460490 return True
0 commit comments