Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@ name: CI

on:
push:
branches: [ "main" ]
branches: ["main"]
pull_request:
branches: [ "main" ]
branches: ["main"]

env:
UV_FROZEN: true

jobs:
test:
format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: 'true'
submodules: "true"
- name: Install uv
uses: astral-sh/setup-uv@v5
with:
Expand All @@ -26,9 +26,24 @@ jobs:
run: |
uv run ruff check bids2table tests
uv run ruff format --check bids2table tests

tests:
runs-on: ubuntu-latest
needs: format
strategy:
matrix:
python-version: ["3.11", "3.12", "3.13"]
steps:
- uses: actions/checkout@v4
with:
submodules: "true"
- name: Install uv with python version
uses: astral-sh/setup-uv@v6
with:
python-version: ${{ matrix.python-version }}
- name: Run tests
run: |
uv run pytest \
uv run --all-extras pytest \
--junitxml=pytest.xml \
--cov-report=xml:coverage.xml \
--cov=bids2table tests
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Docs](https://github.com/childmindresearch/bids2table/actions/workflows/docs.yaml/badge.svg?branch=main)](https://childmindresearch.github.io/bids2table/bids2table)
[![codecov](https://codecov.io/gh/childmindresearch/bids2table/branch/main/graph/badge.svg?token=22HWWFWPW5)](https://codecov.io/gh/childmindresearch/bids2table)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
![Python3](https://img.shields.io/badge/python->=3.12-blue.svg)
![Python3](https://img.shields.io/badge/python->=3.11-blue.svg)
[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)

Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud.
Expand Down
2 changes: 1 addition & 1 deletion bids2table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[![Docs](https://github.com/childmindresearch/bids2table/actions/workflows/docs.yaml/badge.svg?branch=main)](https://childmindresearch.github.io/bids2table/bids2table)
[![codecov](https://codecov.io/gh/childmindresearch/bids2table/branch/main/graph/badge.svg?token=22HWWFWPW5)](https://codecov.io/gh/childmindresearch/bids2table)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
![Python3](https://img.shields.io/badge/python->=3.12-blue.svg)
![Python3](https://img.shields.io/badge/python->=3.11-blue.svg)
[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)

Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud.
Expand Down
11 changes: 4 additions & 7 deletions bids2table/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def main():
parser_index.set_defaults(func=_index_command)

parser_find = subparsers.add_parser("find", help="Find BIDS datasets.")
parser_find.add_argument(
"--maxdepth", type=int, help="Max search depth", default=None
)
parser_find.add_argument(
"--exclude-dirs",
metavar="DIR",
Expand All @@ -75,12 +78,6 @@ def main():
default=None,
help="List of directory names or glob patterns to exclude from search.",
)
parser_find.add_argument(
"--follow-symlinks",
"-L",
action="store_true",
help="Follow symbolic links.",
)
parser_find.add_argument(
"--verbose",
"-v",
Expand Down Expand Up @@ -157,7 +154,7 @@ def _find_command(args: argparse.Namespace):
for dataset in b2t2.find_bids_datasets(
args.root,
exclude=args.exclude_dirs,
follow_symlinks=args.follow_symlinks,
maxdepth=args.maxdepth,
):
print(dataset)

Expand Down
85 changes: 51 additions & 34 deletions bids2table/_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,51 +129,66 @@
def find_bids_datasets(
root: str | PathT,
exclude: str | list[str] | None = None,
follow_symlinks: bool = True,
log_frequency: int = 100,
maxdepth: int | None = None,
Comment thread
clane9 marked this conversation as resolved.
) -> Generator[PathT, None, None]:
"""Find all BIDS datasets under a root directory.

Args:
root: Root path to begin search.
exclude: Glob pattern or list of patterns matching sub-directory names to
exclude from the search.
follow_symlinks: Search into symlinks that point to directories.
maxdepth: Maximum depth to search.

Yields:
Root paths of all BIDS datasets under `root`.
"""
root = as_path(root)

dir_count = 0
if isinstance(exclude, str):
exclude = [exclude]

Check warning on line 148 in bids2table/_indexing.py

View check run for this annotation

Codecov / codecov/patch

bids2table/_indexing.py#L148

Added line #L148 was not covered by tests
elif exclude is None:
exclude = []
exclude = [re.compile(fnmatch.translate(pat)) for pat in exclude]

entry_count = 1
ds_count = 0

# NOTE: Path.walk was introduced in 3.12. Otherwise, could use an older python.
for dirpath, dirnames, _ in root.walk(follow_symlinks=follow_symlinks):
dir_count += 1
if _is_bids_dataset(root):
ds_count += 1
yield root

Check warning on line 158 in bids2table/_indexing.py

View check run for this annotation

Codecov / codecov/patch

bids2table/_indexing.py#L157-L158

Added lines #L157 - L158 were not covered by tests

if _is_bids_dataset(dirpath):
ds_count += 1
yield dirpath
# Tuple of path, depth
stack = [(root, 0)]

# Only descend into specific sub-directories that are allowed to contain
# sub-datasets.
_filter_dirnames(dirnames, _BIDS_NESTED_PARENT_DIRNAMES)
while stack:
top, depth = stack.pop()

# Filter sub-directories to descend into.
if exclude:
matches = _filter_exclude(dirnames, exclude)
_filter_dirnames(dirnames, matches)
inside_bids = _is_bids_dataset(top)
depth += 1

if log_frequency and dir_count % log_frequency == 0:
_logger.info(
"Searched %d directories; found %d BIDS datasets.", dir_count, ds_count
)
for entry in top.iterdir():
entry_count += 1

if log_frequency:
_logger.info(
"Searched %d directories; found %d BIDS datasets.", dir_count, ds_count
)
if any(re.fullmatch(pat, entry.name) for pat in exclude):
continue

if _is_bids_dataset(entry):
ds_count += 1
yield entry

# Checks if we should descend into this directory.
# Check not reached final depth.
descend = maxdepth is None or depth < maxdepth
Comment thread
clane9 marked this conversation as resolved.
# Heuristic checks whether the filename looks like a (visible) directory.
descend = descend and not (entry.suffix or entry.name.startswith("."))
# Only descend into specific subdirectories of BIDS directories.
descend = descend and (
not inside_bids or entry.name in _BIDS_NESTED_PARENT_DIRNAMES
)
# Finally, check if actually a directory (which is slow so we want to
# short-circuit as much as possible).
if descend and entry.is_dir():
stack.append((entry, depth))


def index_dataset(
Expand Down Expand Up @@ -316,6 +331,17 @@

def _is_bids_dataset(path: PathT) -> bool:
"""Test if path is a BIDS dataset root directory."""
# Quick heuristic checks.
# BIDS datasets should not contain a file extension.
if path.suffix:
return False
# Path should not be hidden.
if path.name.startswith("."):
return False
# Subject dirs are not datasets.
if _is_bids_subject_dir(path):
return False

# Check if contains a dataset_description.json or any subject directories. Note,
# it's common for ppl to forget the dataset description, so let's not be too strict.
description_exists = (path / "dataset_description.json").exists()
Expand Down Expand Up @@ -493,15 +519,6 @@
return matching_names


def _filter_dirnames(dirnames: list[str], matches: set[str]) -> None:
"""Remove dirnames matching `matches` in place."""
# Iterate in reversed order since we are modifying in place.
n_names = len(dirnames)
for ii, dirname in enumerate(reversed(dirnames)):
if dirname not in matches:
del dirnames[n_names - ii - 1]


def _hfmt(n: int) -> str:
if n < 10_000:
n_fmt = str(n)
Expand Down
21 changes: 6 additions & 15 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,11 @@ build-backend = "setuptools.build_meta"
[project]
name = "bids2table"
dynamic = ["version"]
authors = [
{ name = "Connor Lane", email = "connor.lane858@gmail.com" },
]
authors = [{ name = "Connor Lane", email = "connor.lane858@gmail.com" }]
description = "Index BIDS datasets fast, locally or in the cloud."
readme = "README.md"
requires-python = ">=3.12"
license = {text = "MIT License"}
requires-python = ">=3.11"
license = { text = "MIT License" }
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
Expand All @@ -20,24 +18,17 @@ classifiers = [
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"License :: OSI Approved :: MIT License",
"Operating System :: POSIX",
"Operating System :: Unix",
"Operating System :: MacOS",
"Operating System :: Microsoft :: Windows",
]

dependencies = [
"bidsschematools>=1.0",
"pyarrow>=14.0.2",
"tqdm>=4.66.2",
]
dependencies = ["bidsschematools>=1.0", "pyarrow>=20.0.0", "tqdm>=4.67.1"]

[project.optional-dependencies]
s3 = [
"cloudpathlib[s3]>=0.17.0",
]
s3 = ["cloudpathlib[s3]>=0.21.0"]

[dependency-groups]
dev = [
Expand All @@ -48,7 +39,7 @@ dev = [
"pre-commit>=4.1.0",
"pytest>=8.3.5",
"pytest-cov>=6.0.0",
"ruff>=0.9.10",
"ruff>=0.11.9",
]

[project.urls]
Expand Down
30 changes: 25 additions & 5 deletions tests/test_indexing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from itertools import islice
from pathlib import Path

import pyarrow as pa
Expand All @@ -25,24 +26,43 @@ def test_get_column_names():


def test_find_bids_datasets():
datasets = sorted(indexing.find_bids_datasets(BIDS_EXAMPLES, log_frequency=100))
datasets = sorted(
indexing.find_bids_datasets(
BIDS_EXAMPLES,
exclude=["surfaces", "subjects", "code", "sourcedata"],
)
)
expected_datasets = sorted(
[p.parent for p in BIDS_EXAMPLES.rglob("dataset_description.json")]
)
# find_bids_datasets finds a few extra derivative datasets that are missing a
# dataset_description.json.
assert set(expected_datasets).issubset(datasets)
assert len(datasets) == len(expected_datasets) + 6
assert len(datasets) == len(expected_datasets) + 3

datasets_no_derivatives = sorted(
indexing.find_bids_datasets(BIDS_EXAMPLES, exclude="derivatives")
indexing.find_bids_datasets(
BIDS_EXAMPLES,
exclude=["derivatives", "code", "sourcedata"],
)
)
expected_datasets_no_derivatives = sorted(
[p.parent for p in BIDS_EXAMPLES.glob("*/dataset_description.json")]
)
assert datasets_no_derivatives == expected_datasets_no_derivatives


def test_find_bids_datasets_s3():
    """Search the `s3://openneuro.org` root and check the first ten datasets found.

    Only the first ten results of a depth-2 search are consumed; their directory
    names are compared, after sorting, against a fixed list.
    """
    # NOTE(review): presumably requires network access to the bucket -- confirm.
    search = indexing.find_bids_datasets("s3://openneuro.org", maxdepth=2)
    first_ten = islice(search, 10)
    found_names = sorted(ds.name for ds in first_ten)
    assert found_names == [
        "ds000001", "ds000002", "ds000003", "ds000005", "ds000006",
        "ds000007", "ds000008", "ds000009", "ds000011", "ds000017",
    ]  # fmt: skip


@pytest.mark.parametrize(
"root,expected_count",
[
Expand Down Expand Up @@ -92,12 +112,12 @@ def test_index_dataset_warns(path: str, msg: str, caplog: LogCaptureFixture):

@pytest.mark.parametrize("max_workers", [0, 2])
def test_batch_index_dataset(max_workers: int):
datasets = list(indexing.find_bids_datasets(BIDS_EXAMPLES))
datasets = list(BIDS_EXAMPLES.glob("*"))
tables = indexing.batch_index_dataset(
datasets, max_workers=max_workers, show_progress=False
)
table = pa.concat_tables(tables)
assert len(table) == 10133
assert len(table) == 9727


@pytest.mark.parametrize(
Expand Down
19 changes: 10 additions & 9 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,6 @@

BIDS_EXAMPLES = Path(__file__).parents[1] / "bids-examples"

COMMANDS = [
"find {examples}",
"index -o {out_dir}/ds102.parquet {examples}/ds102",
"index -o {out_dir}/ds101_ds102.parquet {examples}/ds101 {examples}/ds102",
"index -o {out_dir}/ds10N.parquet '{examples}/ds10?'",
]


@contextmanager
def patch_argv(argv: List[str]):
Expand All @@ -31,7 +24,6 @@ def patch_argv(argv: List[str]):
@pytest.mark.parametrize(
"cmd,output",
[
("find {examples}", None),
("index -o {out_dir}/ds102.parquet {examples}/ds102", "ds102.parquet"),
(
"index -o {out_dir}/ds101_ds102.parquet {examples}/ds101 {examples}/ds102",
Expand All @@ -40,7 +32,7 @@ def patch_argv(argv: List[str]):
("index -o {out_dir}/ds10N.parquet '{examples}/ds10?'", "ds10N.parquet"),
],
)
def test_main(cmd: str, output: str | None, tmp_path: Path):
def test_main_index(cmd: str, output: str | None, tmp_path: Path):
cmd_fmt = cmd.format(out_dir=tmp_path, examples=BIDS_EXAMPLES)
prog = str(Path(cli.__file__).absolute())
argv = [prog] + shlex.split(cmd_fmt)
Expand All @@ -49,3 +41,12 @@ def test_main(cmd: str, output: str | None, tmp_path: Path):

if output:
assert (tmp_path / output).exists()


@pytest.mark.parametrize("cmd", ["find {examples}"])
def test_main_find(cmd: str):
    """Smoke-test the `find` sub-command: `cli.main()` is run with a patched argv.

    The test makes no assertion on output; it passes if the command completes
    without raising.
    """
    script = str(Path(cli.__file__).absolute())
    cli_args = shlex.split(cmd.format(examples=BIDS_EXAMPLES))
    with patch_argv([script, *cli_args]):
        cli.main()
Loading
Loading