Skip to content
23 changes: 13 additions & 10 deletions bids2table/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,7 @@ class _BIDSEntitiesBase:
sub: str = bids_field(name="subject", display_name="Subject", required=True)
ses: Optional[str] = bids_field(name="session", display_name="Session")

datatype: Optional[str] = bids_field(
name="datatype", display_name="Data type", allowed_values=BIDS_DATATYPES
)
datatype: Optional[str] = bids_field(name="datatype", display_name="Data type")
suffix: Optional[str] = bids_field(name="suffix", display_name="Suffix")
ext: Optional[str] = bids_field(name="extension", display_name="Extension")
extra_entities: Optional[Dict[str, Union[str, int]]] = bids_field(
Expand Down Expand Up @@ -303,16 +301,12 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]:
path = Path(path)
entities = {}

# datatype
match = re.search(
f"/({'|'.join(BIDS_DATATYPES)})/",
path.as_posix(),
)
datatype = match.group(1) if match is not None else None

filename = path.name
parts = filename.split("_")

# datatype
datatype = parse_bids_datatype(path)

# suffix and extension
suffix_ext = parts.pop()
idx = suffix_ext.find(".")
Expand Down Expand Up @@ -340,6 +334,15 @@ def parse_bids_entities(path: StrOrPath) -> Dict[str, str]:
return entities


# Captures the directory that directly follows the subject directory (and the
# optional session directory) in a POSIX-style path. The leading ``(?:^|/)`` lets
# the subject directory start the string, so relative paths such as
# ``sub-01/anat/...`` match as well as absolute ones.
BIDS_DATATYPE_PATTERN = re.compile(
    r"(?:^|/)sub-[a-zA-Z0-9]+(?:/ses-[a-zA-Z0-9]+)?/([a-z]+)/"
)


def parse_bids_datatype(path: StrOrPath) -> Optional[str]:
    """
    Parse the datatype directory name from a BIDS path.

    The datatype is taken to be the lowercase directory directly below the
    ``sub-<label>`` directory, or below ``sub-<label>/ses-<label>`` when a session
    directory is present. The path is converted to POSIX form before matching, and
    it does not need to exist.

    Args:
        path: path to parse; may be absolute or relative.

    Returns:
        The datatype directory name, or `None` if the path has no such directory.
    """
    match = BIDS_DATATYPE_PATTERN.search(Path(path).as_posix())
    return match.group(1) if match is not None else None


# Read-only lookup from each `BIDSEntities` field's "name" metadata value to that
# field's attribute name.
ENTITY_NAMES_TO_KEYS = MappingProxyType(
    {
        entity_field.metadata["name"]: entity_field.name
        for entity_field in fields(BIDSEntities)
    }
)
16 changes: 16 additions & 0 deletions bids2table/extractors/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os
from functools import lru_cache
from pathlib import Path
from typing import List

import pandas as pd


@lru_cache(maxsize=128)
def _glob(path: Path, pattern: str) -> List[Path]:
    """
    Glob `pattern` directly under `path`, memoizing the result.

    Results are cached per `(path, pattern)` pair, so a repeated call returns the
    same list object without reading the directory again. Callers should treat the
    returned list as read-only.
    """
    matches = [*path.glob(pattern)]
    return matches


@lru_cache()
def _list_files(path: Path) -> pd.Series:
    """
    List the entry names in directory `path` as a pandas Series of strings.

    Results are cached per `path`, so a repeated call returns the same Series
    without reading the directory again. Callers should treat the returned Series
    as read-only.
    """
    # Pin the dtype so that an empty directory still yields a string-typed Series.
    # Without it, an empty listing has no values to infer a dtype from, and the
    # `.str` accessor is not guaranteed to be usable on the result.
    return pd.Series(os.listdir(path), dtype=str)
29 changes: 21 additions & 8 deletions bids2table/extractors/bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from elbow.sources.filesystem import Crawler
from elbow.typing import StrOrPath

from bids2table.entities import BIDSEntities
from bids2table.entities import BIDSEntities, parse_bids_entities

from .dataset import extract_dataset
from .metadata import extract_metadata, is_associated_sidecar
Expand Down Expand Up @@ -53,14 +53,27 @@ def extract_bids_subdir(

def is_bids_file(path: StrOrPath) -> bool:
    """
    Check if `path` is a valid BIDS data file.

    A path qualifies when it exists, has a file extension, has a name starting with
    "sub-", parses to both a suffix and a datatype, is not a JSON sidecar associated
    with another data file, and its parent directory is not itself a BIDS file.
    """
    # TODO: other checks?
    #   - skip files matching patterns in .bidsignore?
    path = Path(path)

    # Initial fast checks. The pure string tests run first so that most non-BIDS
    # paths are rejected without touching the filesystem.
    if not (path.name.startswith("sub-") and path.suffix != "" and path.exists()):
        return False

    entities = parse_bids_entities(path)
    if not (entities.get("suffix") and entities.get("datatype")):
        return False

    if is_associated_sidecar(path):
        return False

    # Very special case for directories that are treated as BIDS "files", e.g.
    # microscopy .ome.zarr directories or MEG .ds directories: an entry whose parent
    # is such a directory belongs to that "file" and is not a BIDS file itself.
    if is_bids_file(path.parent):
        return False

    return True
15 changes: 5 additions & 10 deletions bids2table/extractors/inheritance.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from functools import lru_cache
from pathlib import Path
from typing import Dict, Generator, List, Optional
from typing import Dict, Generator, Optional

from elbow.typing import StrOrPath

from bids2table.entities import parse_bids_entities

from ._utils import _glob
from .dataset import is_dataset_root


Expand All @@ -24,11 +24,11 @@ def find_bids_parents(

Yields matching `path`s in decreasing topological order.
"""
suffix = query.get("suffix")
ext = query.get("ext")
suffix = query.get("suffix", "")
ext = query.get("ext", "")
if not (suffix or ext):
raise ValueError("At least one of 'suffix' or 'ext' are required in `query`.")
pattern = f"*{suffix}{ext}" if suffix else f"*{ext}"
pattern = f"*{suffix}{ext}"

start = Path(start).absolute()
if not start.is_dir():
Expand Down Expand Up @@ -62,11 +62,6 @@ def find_first_bids_parent(
return next(find_bids_parents(query, start, depth), None)


@lru_cache(maxsize=128)
def _glob(path: Path, pattern: str) -> List[Path]:
    """
    Glob `pattern` directly under `path`, memoizing the result.

    Results are cached per `(path, pattern)` pair, so a repeated call returns the
    same list object without reading the directory again. Callers should treat the
    returned list as read-only.
    """
    matches = [*path.glob(pattern)]
    return matches


def _test_bids_match(query: Dict[str, str], entities: Dict[str, str]) -> bool:
"""
Test if entities satisfies the inheritance principle for query.
Expand Down
22 changes: 12 additions & 10 deletions bids2table/extractors/metadata.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import json
import logging
import traceback
from functools import lru_cache
from pathlib import Path

from elbow.record import Record
from elbow.typing import StrOrPath

from bids2table.entities import parse_bids_entities

from .inheritance import _glob, find_bids_parents
from ._utils import _list_files
from .inheritance import find_bids_parents

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -42,13 +44,12 @@ def is_associated_sidecar(path: StrOrPath) -> bool:
Check if a file is a JSON sidecar associated with other data file(s).
"""
path = Path(path)
entities = parse_bids_entities(path)

# Must be JSON
if not path.suffix == ".json":
if entities.get("ext") != ".json":
return False

entities = parse_bids_entities(path)

# Assume all JSON above the lowest level of hierarchy are associated
if entities.get("datatype") is None:
return True
Expand All @@ -59,10 +60,11 @@ def is_associated_sidecar(path: StrOrPath) -> bool:
return False

# Finally, check if there are any matches at the lowest level
# If not, we are a key-value file or solo sidecar like an MRIQC IQM JSON.
# Note this pattern always matches the file itself, so we check if there are any
# extra matches.
if len(_glob(path.parent, f"*_{suffix}.*")) > 1:
return True
# If not, we are a key-value file or solo JSON like an MRIQC IQM JSON.
return _contains_matches(path.parent, suffix)


return False
@lru_cache()
def _contains_matches(path: Path, suffix: str) -> bool:
    """
    Check if directory `path` lists any non-JSON entry with the suffix `suffix`.

    An entry matches when its name contains `_<suffix>.` and what follows that dot
    is not exactly `json` up to the end of the name. A JSON file therefore never
    matches itself, while e.g. `sub-01_bold.nii.gz` matches for suffix `bold`.

    Results are cached per `(path, suffix)` pair.
    """
    import re  # function-scope import so the block does not rely on a module-level one

    file_list = _list_files(path)
    # Escape the suffix so it is matched literally, and require the extension dot
    # right after it so that e.g. suffix "bold" does not match "_boldref.nii.gz".
    pattern = rf"_{re.escape(suffix)}\.(?!json$)"
    return bool(file_list.str.contains(pattern, regex=True).any())
Loading