Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bids2table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,5 +159,5 @@
get_bids_entity_arrow_schema,
format_bids_path,
)
from ._pathlib import Path, cloudpathlib_is_available
from ._pathlib import cloudpathlib_is_available
from ._version import *
12 changes: 4 additions & 8 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import argparse
import concurrent.futures
import re
import glob
import sys

import pyarrow.parquet as pq

import bids2table as b2t2
from bids2table import Path
from bids2table._logging import setup_logger
from bids2table._pathlib import as_path

_logger = setup_logger(__package__)

Expand Down Expand Up @@ -116,8 +116,8 @@ def _index_command(args: argparse.Namespace):

root = []
for path in args.root:
if _is_glob(path):
path = Path(path)
if glob.has_magic(path):
path = as_path(path)
paths = list(path.parent.glob(path.name))
root.extend(paths)
else:
Expand Down Expand Up @@ -171,9 +171,5 @@ def _check_path(path: str):
sys.exit(1)


def _is_glob(path: str) -> bool:
return bool(re.search(r"[*?\[\]]", path))


if __name__ == "__main__":
sys.exit(main())
7 changes: 2 additions & 5 deletions bids2table/_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,8 @@ def parse_bids_entities(path: str | Path) -> dict[str, str]:

# Get suffix and extension.
suffix_ext = parts.pop()
idx = suffix_ext.find(".")
if idx < 0:
suffix, ext = suffix_ext, None
else:
suffix, ext = suffix_ext[:idx], suffix_ext[idx:]
suffix, dot, ext = suffix_ext.partition(".")
ext = dot + ext if ext else None

# Suffix is actually an entity, put back in list.
if "-" in suffix:
Expand Down
48 changes: 22 additions & 26 deletions bids2table/_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
validate_bids_entities,
)
from ._logging import setup_logger
from ._pathlib import Path
from ._pathlib import PathT, as_path

_BIDS_SUBJECT_DIR_PATTERN = re.compile(r"sub-[a-zA-Z0-9]+")

Expand Down Expand Up @@ -109,7 +109,7 @@ def get_arrow_schema() -> pa.Schema:
return schema


def get_column_names() -> enum.EnumType:
def get_column_names() -> enum.StrEnum:
"""Get an enum of the BIDS index columns."""
# TODO: It might be nice if the column names were statically available. One option
# would be to generate a static _schema.py module at install time (similar to how
Expand All @@ -127,11 +127,11 @@ def get_column_names() -> enum.EnumType:


def find_bids_datasets(
root: str | Path,
root: str | PathT,
exclude: str | list[str] | None = None,
follow_symlinks: bool = True,
log_frequency: int = 100,
) -> Generator[Path, None, None]:
) -> Generator[PathT, None, None]:
"""Find all BIDS datasets under a root directory.

Args:
Expand All @@ -143,8 +143,7 @@ def find_bids_datasets(
Yields:
Root paths of all BIDS datasets under `root`.
"""
if isinstance(root, str):
root = Path(root)
root = as_path(root)

dir_count = 0
ds_count = 0
Expand Down Expand Up @@ -178,7 +177,7 @@ def find_bids_datasets(


def index_dataset(
root: str | Path,
root: str | PathT,
include_subjects: str | list[str] | None = None,
max_workers: int | None = 0,
chunksize: int = 32,
Expand All @@ -203,8 +202,7 @@ def index_dataset(
Returns:
An Arrow table index of the BIDS dataset.
"""
if isinstance(root, str):
root = Path(root)
root = as_path(root)

schema = get_arrow_schema()

Expand Down Expand Up @@ -243,7 +241,7 @@ def index_dataset(


def batch_index_dataset(
roots: list[str | Path],
roots: list[str | PathT],
max_workers: int | None = 0,
executor_cls: type[Executor] = ProcessPoolExecutor,
show_progress: bool = False,
Expand Down Expand Up @@ -275,13 +273,13 @@ def batch_index_dataset(
yield table


def _batch_index_func(root: str | Path) -> tuple[str, pa.Table]:
def _batch_index_func(root: str | PathT) -> tuple[str | None, pa.Table]:
dataset, _ = _get_bids_dataset(root)
table = index_dataset(root, max_workers=0, show_progress=False)
return dataset, table


def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
def _get_bids_dataset(path: str | PathT) -> tuple[str | None, PathT | None]:
"""Get the BIDS dataset that the path belongs to, if any.

Return the dataset directory name and the full dataset path. For nested derivatives
Expand All @@ -290,13 +288,10 @@ def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:

Note that the name is extracted from the path, not the dataset description JSON.
"""
if isinstance(path, str):
path = Path(path)

parent = path
parent = as_path(path)
parts: list[str] = []
scanning = False
top_idx = None
top_idx = 0
root = None

while parent.name:
Expand All @@ -319,24 +314,24 @@ def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
return dataset, root


def _is_bids_dataset(path: Path) -> bool:
def _is_bids_dataset(path: PathT) -> bool:
"""Test if path is a BIDS dataset root directory."""
# Check if contains a dataset_description.json or any subject directories. Note,
# it's common for ppl to forget the dataset description, so let's not be too strict.
description_exists = (path / "dataset_description.json").exists()
return description_exists or _contains_bids_subject_dirs(path)


def _contains_bids_subject_dirs(root: Path) -> bool:
def _contains_bids_subject_dirs(root: PathT) -> bool:
"""Check if a path contains one or more BIDS subject dirs."""
# Nb, this will return on the first matching path thanks to the generator.
return any(_is_bids_subject_dir(path) for path in root.glob("sub-*"))


def _find_bids_subject_dirs(
root: Path,
root: PathT,
include_subjects: str | list[str] | None = None,
) -> list[Path]:
) -> list[PathT]:
"""Find all BIDS subject dirs contained in a root directory.

Note, only looks one level down. Does not find nested subject directories, e.g. in
Expand All @@ -352,7 +347,7 @@ def _find_bids_subject_dirs(
return paths


def _is_bids_subject_dir(path: Path) -> bool:
def _is_bids_subject_dir(path: PathT) -> bool:
"""Check if a path is a BIDS subject directory."""
# NOTE: not checking if the path is in fact a directory.
# This is a slow op, especially on cloud. Can assume that there are no files
Expand All @@ -362,7 +357,7 @@ def _is_bids_subject_dir(path: Path) -> bool:


def _index_bids_subject_dir(
path: Path,
path: PathT,
schema: pa.Schema | None = None,
dataset: str | None = None,
) -> tuple[str, pa.Table]:
Expand Down Expand Up @@ -394,7 +389,7 @@ def _index_bids_subject_dir(
return subject, table


def _is_bids_file(path: Path) -> bool:
def _is_bids_file(path: PathT) -> bool:
"""Check if file is a BIDS file.

Not very exact, but hopefully good enough.
Expand All @@ -407,7 +402,8 @@ def _is_bids_file(path: Path) -> bool:
return False

entities = _cache_parse_bids_entities(path)
# if not (entities.get("suffix") and entities.get("datatype")):
# If we want to exclude metadata files like *_scans.tsv, we can also check for
# datatype.
if not (entities.get("suffix") and entities.get("ext")):
return False

Expand All @@ -422,7 +418,7 @@ def _is_bids_file(path: Path) -> bool:
return True


def _is_bids_json_sidecar(path: Path) -> bool:
def _is_bids_json_sidecar(path: PathT) -> bool:
"""Quick check if a file is a JSON sidecar."""
# Quick check if path suffix is not json.
if path.suffix != ".json":
Expand Down
28 changes: 17 additions & 11 deletions bids2table/_pathlib.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
from pathlib import Path

try:
# Overshadow pathlib Path.
from cloudpathlib import AnyPath as Path
from cloudpathlib import AnyPath, CloudPath, S3Client

_CLOUDPATHLIB_AVAILABLE = True

# Set unsigned client as default for s3:// paths
S3Client(no_sign_request=True).set_as_default_client()

except ImportError:
AnyPath = CloudPath = Path

Check warning on line 12 in bids2table/_pathlib.py

View check run for this annotation

Codecov / codecov/patch

bids2table/_pathlib.py#L12

Added line #L12 was not covered by tests

_CLOUDPATHLIB_AVAILABLE = False

__all__ = ["Path", "cloudpathlib_is_available"]
__all__ = ["PathT", "as_path", "cloudpathlib_is_available"]

PathT = Path | CloudPath

def cloudpathlib_is_available() -> bool:
    """Report whether the optional cloudpathlib import succeeded.

    Returns the module-level `_CLOUDPATHLIB_AVAILABLE` flag unchanged.
    """
    available = _CLOUDPATHLIB_AVAILABLE
    return available

def as_path(path: str | PathT) -> PathT:
    """Return `path` as a path object.

    Strings are passed through `AnyPath`; any other input is returned
    unchanged.
    """
    return AnyPath(path) if isinstance(path, str) else path

if _CLOUDPATHLIB_AVAILABLE:
# Set unsigned client as default for s3:// paths
from cloudpathlib import S3Client

client = S3Client(no_sign_request=True)
client.set_as_default_client()
def cloudpathlib_is_available() -> bool:
    """Report whether the optional cloudpathlib import succeeded.

    Returns the module-level `_CLOUDPATHLIB_AVAILABLE` flag unchanged.
    """
    available = _CLOUDPATHLIB_AVAILABLE
    return available
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ dependencies = [

[project.optional-dependencies]
s3 = [
"cloudpathlib[s3]==0.17.0",
"cloudpathlib[s3]>=0.17.0",
]

[dependency-groups]
Expand Down
39 changes: 39 additions & 0 deletions tests/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
from pathlib import Path

import pyarrow as pa
import pytest
from pytest import LogCaptureFixture

import bids2table._indexing as indexing
from bids2table._pathlib import cloudpathlib_is_available
Expand Down Expand Up @@ -72,6 +74,22 @@ def test_index_dataset_parallel(max_workers: int):
assert len(table) == expected_count


@pytest.mark.parametrize(
    ("path", "msg"),
    [
        # Not a bids dataset.
        ("tools", "not a valid BIDS"),
        # Has dataset_description.json but no valid subject dirs.
        ("ieeg_epilepsy/derivatives/brainvisa", "no matching subject"),
    ],
)
def test_index_dataset_warns(path: str, msg: str, caplog: LogCaptureFixture):
    """Indexing an unusable directory yields an empty table and logs `msg`."""
    dataset_root = BIDS_EXAMPLES / path
    with caplog.at_level(logging.WARNING):
        table = indexing.index_dataset(dataset_root)

    # The result must be empty, and the expected warning must be captured.
    assert len(table) == 0
    assert msg in caplog.text


@pytest.mark.parametrize("max_workers", [0, 2])
def test_batch_index_dataset(max_workers: int):
datasets = list(indexing.find_bids_datasets(BIDS_EXAMPLES))
Expand All @@ -92,6 +110,7 @@ def test_batch_index_dataset(max_workers: int):
def test_get_bids_dataset(path: str, expected_name: str):
name, dataset_path = indexing._get_bids_dataset(BIDS_EXAMPLES / path)
assert name == expected_name
assert dataset_path is not None
assert indexing._contains_bids_subject_dirs(dataset_path)


Expand Down Expand Up @@ -154,6 +173,11 @@ def test_is_bids_subject_dir(path: str, expected: bool):
"eeg_face13/sub-010/eeg/sub-010_coordsystem.json",
True,
),
(
# JSON data file with compound extension.
"sub-0025428_ses-1_hemi-L_space-native_midthickness.surf.json",
True,
),
(
# Special case of directory that is a bids "file".
"ds000247/sub-0007/ses-0001/meg/sub-0007_ses-0001_task-rest_run-01_meg.ds/",
Expand All @@ -178,3 +202,18 @@ def test_filter_include_exclude():
filtered_names = indexing._filter_include(names, include)
filtered_names = indexing._filter_exclude(filtered_names, exclude)
assert filtered_names == expected


@pytest.mark.parametrize(
    ("num", "expected"),
    [
        (12, "12"),
        (1234, "1234"),
        (65432, "65K"),
        (165432, "165K"),
        (2165432, "2.2M"),
        (52165432, "52M"),
    ],
)
def test_h_fmt(num: int, expected: str):
    """Check `indexing._hfmt` against the expected string for each count."""
    formatted = indexing._hfmt(num)
    assert formatted == expected
6 changes: 3 additions & 3 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading