childmindresearch · clane9 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/bids2table/entities.py b/bids2table/entities.py
@@ -81,9 +81,7 @@ class _BIDSEntitiesBase:
     sub: str = bids_field(name="subject", display_name="Subject", required=True)
     ses: Optional[str] = bids_field(name="session", display_name="Session")
 
-    datatype: Optional[str] = bids_field(
-        name="datatype", display_name="Data type", allowed_values=BIDS_DATATYPES
-    )
+    datatype: Optional[str] = bids_field(name="datatype", display_name="Data type")
     suffix: Optional[str] = bids_field(name="suffix", display_name="Suffix")
     ext: Optional[str] = bids_field(name="extension", display_name="Extension")
     extra_entities: Optional[Dict[str, Union[str, int]]] = bids_field(
@@ -303,16 +301,12 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]:
     path = Path(path)
     entities = {}
 
-    # datatype
-    match = re.search(
-        f"/({'|'.join(BIDS_DATATYPES)})/",
-        path.as_posix(),
-    )
-    datatype = match.group(1) if match is not None else None
-
     filename = path.name
     parts = filename.split("_")
 
+    # datatype
+    datatype = parse_bids_datatype(path)
+
     # suffix and extension
     suffix_ext = parts.pop()
     idx = suffix_ext.find(".")
@@ -340,6 +334,25 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]:
     return entities
 
 
+# Captures the directory directly below `sub-*/` or `sub-*/ses-*/` as the datatype.
+# `(?:^|/)` lets relative paths that start at the subject directory match as well.
+BIDS_DATATYPE_PATTERN = re.compile(
+    r"(?:^|/)sub-[a-zA-Z0-9]+(?:/ses-[a-zA-Z0-9]+)?/([a-z]+)/"
+)
+
+
+def parse_bids_datatype(path: StrOrPath) -> Optional[str]:
+    """
+    Parse the datatype directory name from a BIDS path.
+
+    The datatype is taken to be the all-lowercase directory directly below the
+    subject (or session) directory, e.g. `anat` in `sub-01/ses-01/anat/...`.
+    Returns `None` if `path` has no such directory.
+    """
+    match = re.search(BIDS_DATATYPE_PATTERN, Path(path).as_posix())
+    return match.group(1) if match is not None else None
+
+
 ENTITY_NAMES_TO_KEYS = MappingProxyType(
     {f.metadata["name"]: f.name for f in fields(BIDSEntities)}
 )
diff --git a/bids2table/extractors/_utils.py b/bids2table/extractors/_utils.py
@@ -0,0 +1,24 @@
+import os
+from functools import lru_cache
+from pathlib import Path
+from typing import List
+
+import pandas as pd
+
+
+@lru_cache()
+def _glob(path: Path, pattern: str) -> List[Path]:
+    """
+    Glob `pattern` under `path`, memoizing the result per `(path, pattern)`.
+    """
+    matches = path.glob(pattern)
+    return list(matches)
+
+
+@lru_cache()
+def _list_files(path: Path) -> pd.Series:
+    """
+    List the entry names of directory `path` as a Series, memoized per `path`.
+    """
+    names = os.listdir(path)
+    return pd.Series(names)
diff --git a/bids2table/extractors/bids.py b/bids2table/extractors/bids.py
@@ -7,7 +7,7 @@
 from elbow.sources.filesystem import Crawler
 from elbow.typing import StrOrPath
 
-from bids2table.entities import BIDSEntities
+from bids2table.entities import BIDSEntities, parse_bids_entities
 
 from .dataset import extract_dataset
 from .metadata import extract_metadata, is_associated_sidecar
@@ -53,14 +53,27 @@ def extract_bids_subdir(
 
 def is_bids_file(path: StrOrPath) -> bool:
     """
-    Check if `path` is a valid BIDS data file. E.g. not a directory or JSON sidecar
-    associated to another data file.
+    Check if `path` is a BIDS data file, or a directory treated as one (e.g. `.ds`).
     """
     # TODO: other checks?
     #   - skip files matching patterns in .bidsignore?
     path = Path(path)
-    return (
-        not path.is_dir()
-        and path.name.startswith("sub-")
-        and not is_associated_sidecar(path)
-    )
+
+    # Cheap checks first: must exist, have an extension, and be named `sub-*`.
+    if not (path.exists() and path.suffix != "" and path.name.startswith("sub-")):
+        return False
+
+    entities = parse_bids_entities(path)
+    if not (entities.get("suffix") and entities.get("datatype")):
+        return False
+
+    if is_associated_sidecar(path):
+        return False
+
+    # Special case: some directories are themselves BIDS "files", e.g. microscopy
+    # .ome.zarr or MEG .ds directories. Anything located directly inside such a
+    # directory is part of that "file" and is not a separate BIDS file.
+    if is_bids_file(path.parent):
+        return False
+
+    return True
diff --git a/bids2table/extractors/inheritance.py b/bids2table/extractors/inheritance.py
@@ -1,11 +1,11 @@
-from functools import lru_cache
 from pathlib import Path
-from typing import Dict, Generator, List, Optional
+from typing import Dict, Generator, Optional
 
 from elbow.typing import StrOrPath
 
 from bids2table.entities import parse_bids_entities
 
+from ._utils import _glob
 from .dataset import is_dataset_root
 
 
@@ -24,11 +24,11 @@ def find_bids_parents(
 
     Yields matching `path`s in decreasing topological order.
     """
-    suffix = query.get("suffix")
-    ext = query.get("ext")
+    suffix = query.get("suffix", "")
+    ext = query.get("ext", "")
     if not (suffix or ext):
         raise ValueError("At least one of 'suffix' or 'ext' are required in `query`.")
-    pattern = f"*{suffix}{ext}" if suffix else f"*{ext}"
+    pattern = f"*{suffix}{ext}"
 
     start = Path(start).absolute()
     if not start.is_dir():
@@ -62,11 +62,6 @@ def find_first_bids_parent(
     return next(find_bids_parents(query, start, depth), None)
 
 
-@lru_cache()
-def _glob(path: Path, pattern: str) -> List[Path]:
-    return list(path.glob(pattern))
-
-
 def _test_bids_match(query: Dict[str, str], entities: Dict[str, str]) -> bool:
     """
     Test if entities satisfies the inheritance principle for query.

diff --git a/bids2table/extractors/metadata.py b/bids2table/extractors/metadata.py
@@ -1,14 +1,16 @@
 import json
 import logging
 import traceback
+from functools import lru_cache
 from pathlib import Path
 
 from elbow.record import Record
 from elbow.typing import StrOrPath
 
 from bids2table.entities import parse_bids_entities
 
-from .inheritance import _glob, find_bids_parents
+from ._utils import _list_files
+from .inheritance import find_bids_parents
 
 logger = logging.getLogger(__name__)
 
@@ -42,13 +44,12 @@ def is_associated_sidecar(path: StrOrPath) -> bool:
     Check if a file is a JSON sidecar associated with other data file(s).
     """
     path = Path(path)
+    entities = parse_bids_entities(path)
 
     # Must be JSON
-    if not path.suffix == ".json":
+    if entities.get("ext") != ".json":
         return False
 
-    entities = parse_bids_entities(path)
-
     # Assume all JSON above the lowest level of hierarchy are associated
     if entities.get("datatype") is None:
         return True
@@ -59,10 +60,17 @@ def is_associated_sidecar(path: StrOrPath) -> bool:
         return False
 
     # Finally, check if there are any matches at the lowest level
-    # If not, we are a key-value file or solo sidecar like an MRIQC IQM JSON.
-    # Note this pattern always matches the file itself, so we check if there are any
-    # extra matches.
-    if len(_glob(path.parent, f"*_{suffix}.*")) > 1:
-        return True
+    # If not, we are a key-value file or solo JSON like an MRIQC IQM JSON.
+    return _contains_matches(path.parent, suffix)
+
 
-    return False
+@lru_cache()
+def _contains_matches(path: Path, suffix: str) -> bool:
+    """
+    Check if directory `path` lists a file named `*_{suffix}.<ext>` whose extension
+    does not start with `json`.
+    """
+    file_list = _list_files(path)
+    # Anchor on the "." right after the suffix so that e.g. "phase" does not also
+    # match "phasediff" files.
+    return bool(file_list.str.contains(f"_{suffix}\\.(?!json)").any())