Skip to content

Commit c605559

Browse files
authored
Pre-release fixes (#50)
* Minor cleanup * Add compound json test case for `_is_bids_file` * Add test case for indexing invalid/empty datasets * Remove commented line and add comment * Add test for `_hfmt` Nb, kind of excessive. * Typing fixes * Update pyproject.toml * Minor update
1 parent 6892ed1 commit c605559

8 files changed

Lines changed: 89 additions & 55 deletions

File tree

bids2table/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,5 +159,5 @@
159159
get_bids_entity_arrow_schema,
160160
format_bids_path,
161161
)
162-
from ._pathlib import Path, cloudpathlib_is_available
162+
from ._pathlib import cloudpathlib_is_available
163163
from ._version import *

bids2table/__main__.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import argparse
22
import concurrent.futures
3-
import re
3+
import glob
44
import sys
55

66
import pyarrow.parquet as pq
77

88
import bids2table as b2t2
9-
from bids2table import Path
109
from bids2table._logging import setup_logger
10+
from bids2table._pathlib import as_path
1111

1212
_logger = setup_logger(__package__)
1313

@@ -116,8 +116,8 @@ def _index_command(args: argparse.Namespace):
116116

117117
root = []
118118
for path in args.root:
119-
if _is_glob(path):
120-
path = Path(path)
119+
if glob.has_magic(path):
120+
path = as_path(path)
121121
paths = list(path.parent.glob(path.name))
122122
root.extend(paths)
123123
else:
@@ -171,9 +171,5 @@ def _check_path(path: str):
171171
sys.exit(1)
172172

173173

174-
def _is_glob(path: str) -> bool:
175-
return bool(re.search(r"[*?\[\]]", path))
176-
177-
178174
if __name__ == "__main__":
179175
sys.exit(main())

bids2table/_entities.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,11 +157,8 @@ def parse_bids_entities(path: str | Path) -> dict[str, str]:
157157

158158
# Get suffix and extension.
159159
suffix_ext = parts.pop()
160-
idx = suffix_ext.find(".")
161-
if idx < 0:
162-
suffix, ext = suffix_ext, None
163-
else:
164-
suffix, ext = suffix_ext[:idx], suffix_ext[idx:]
160+
suffix, dot, ext = suffix_ext.partition(".")
161+
ext = dot + ext if ext else None
165162

166163
# Suffix is actually an entity, put back in list.
167164
if "-" in suffix:

bids2table/_indexing.py

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
validate_bids_entities,
2222
)
2323
from ._logging import setup_logger
24-
from ._pathlib import Path
24+
from ._pathlib import PathT, as_path
2525

2626
_BIDS_SUBJECT_DIR_PATTERN = re.compile(r"sub-[a-zA-Z0-9]+")
2727

@@ -109,7 +109,7 @@ def get_arrow_schema() -> pa.Schema:
109109
return schema
110110

111111

112-
def get_column_names() -> enum.EnumType:
112+
def get_column_names() -> enum.StrEnum:
113113
"""Get an enum of the BIDS index columns."""
114114
# TODO: It might be nice if the column names were statically available. One option
115115
# would be to generate a static _schema.py module at install time (similar to how
@@ -127,11 +127,11 @@ def get_column_names() -> enum.EnumType:
127127

128128

129129
def find_bids_datasets(
130-
root: str | Path,
130+
root: str | PathT,
131131
exclude: str | list[str] | None = None,
132132
follow_symlinks: bool = True,
133133
log_frequency: int = 100,
134-
) -> Generator[Path, None, None]:
134+
) -> Generator[PathT, None, None]:
135135
"""Find all BIDS datasets under a root directory.
136136
137137
Args:
@@ -143,8 +143,7 @@ def find_bids_datasets(
143143
Yields:
144144
Root paths of all BIDS datasets under `root`.
145145
"""
146-
if isinstance(root, str):
147-
root = Path(root)
146+
root = as_path(root)
148147

149148
dir_count = 0
150149
ds_count = 0
@@ -178,7 +177,7 @@ def find_bids_datasets(
178177

179178

180179
def index_dataset(
181-
root: str | Path,
180+
root: str | PathT,
182181
include_subjects: str | list[str] | None = None,
183182
max_workers: int | None = 0,
184183
chunksize: int = 32,
@@ -203,8 +202,7 @@ def index_dataset(
203202
Returns:
204203
An Arrow table index of the BIDS dataset.
205204
"""
206-
if isinstance(root, str):
207-
root = Path(root)
205+
root = as_path(root)
208206

209207
schema = get_arrow_schema()
210208

@@ -243,7 +241,7 @@ def index_dataset(
243241

244242

245243
def batch_index_dataset(
246-
roots: list[str | Path],
244+
roots: list[str | PathT],
247245
max_workers: int | None = 0,
248246
executor_cls: type[Executor] = ProcessPoolExecutor,
249247
show_progress: bool = False,
@@ -275,13 +273,13 @@ def batch_index_dataset(
275273
yield table
276274

277275

278-
def _batch_index_func(root: str | Path) -> tuple[str, pa.Table]:
276+
def _batch_index_func(root: str | PathT) -> tuple[str | None, pa.Table]:
279277
dataset, _ = _get_bids_dataset(root)
280278
table = index_dataset(root, max_workers=0, show_progress=False)
281279
return dataset, table
282280

283281

284-
def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
282+
def _get_bids_dataset(path: str | PathT) -> tuple[str | None, PathT | None]:
285283
"""Get the BIDS dataset that the path belongs to, if any.
286284
287285
Return the dataset directory name and the full dataset path. For nested derivatives
@@ -290,13 +288,10 @@ def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
290288
291289
Note that the name is extracted from the path, not the dataset description JSON.
292290
"""
293-
if isinstance(path, str):
294-
path = Path(path)
295-
296-
parent = path
291+
parent = as_path(path)
297292
parts: list[str] = []
298293
scanning = False
299-
top_idx = None
294+
top_idx = 0
300295
root = None
301296

302297
while parent.name:
@@ -319,24 +314,24 @@ def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
319314
return dataset, root
320315

321316

322-
def _is_bids_dataset(path: Path) -> bool:
317+
def _is_bids_dataset(path: PathT) -> bool:
323318
"""Test if path is a BIDS dataset root directory."""
324319
# Check if contains a dataset_description.json or any subject directories. Note,
325320
# it's common for ppl to forget the dataset description, so let's not be too strict.
326321
description_exists = (path / "dataset_description.json").exists()
327322
return description_exists or _contains_bids_subject_dirs(path)
328323

329324

330-
def _contains_bids_subject_dirs(root: Path) -> bool:
325+
def _contains_bids_subject_dirs(root: PathT) -> bool:
331326
"""Check if a path contains one or more BIDS subject dirs."""
332327
# Nb, this will return on the first matching path thanks to the generator.
333328
return any(_is_bids_subject_dir(path) for path in root.glob("sub-*"))
334329

335330

336331
def _find_bids_subject_dirs(
337-
root: Path,
332+
root: PathT,
338333
include_subjects: str | list[str] | None = None,
339-
) -> list[Path]:
334+
) -> list[PathT]:
340335
"""Find all BIDS subject dirs contained in a root directory.
341336
342337
Note, only looks one level down. Does not find nested subject directories, e.g. in
@@ -352,7 +347,7 @@ def _find_bids_subject_dirs(
352347
return paths
353348

354349

355-
def _is_bids_subject_dir(path: Path) -> bool:
350+
def _is_bids_subject_dir(path: PathT) -> bool:
356351
"""Check if a path is a BIDS subject directory."""
357352
# NOTE: not checking if the path is in fact a directory.
358353
# This is a slow op, especially on cloud. Can assume that there are no files
@@ -362,7 +357,7 @@ def _is_bids_subject_dir(path: Path) -> bool:
362357

363358

364359
def _index_bids_subject_dir(
365-
path: Path,
360+
path: PathT,
366361
schema: pa.Schema | None = None,
367362
dataset: str | None = None,
368363
) -> tuple[str, pa.Table]:
@@ -394,7 +389,7 @@ def _index_bids_subject_dir(
394389
return subject, table
395390

396391

397-
def _is_bids_file(path: Path) -> bool:
392+
def _is_bids_file(path: PathT) -> bool:
398393
"""Check if file is a BIDS file.
399394
400395
Not very exact, but hopefully good enough.
@@ -407,7 +402,8 @@ def _is_bids_file(path: Path) -> bool:
407402
return False
408403

409404
entities = _cache_parse_bids_entities(path)
410-
# if not (entities.get("suffix") and entities.get("datatype")):
405+
# If we want to exclude metadata files like *_scans.tsv, we can also check for
406+
# datatype.
411407
if not (entities.get("suffix") and entities.get("ext")):
412408
return False
413409

@@ -422,7 +418,7 @@ def _is_bids_file(path: Path) -> bool:
422418
return True
423419

424420

425-
def _is_bids_json_sidecar(path: Path) -> bool:
421+
def _is_bids_json_sidecar(path: PathT) -> bool:
426422
"""Quick check if a file is a JSON sidecar."""
427423
# Quick check if path suffix is not json.
428424
if path.suffix != ".json":

bids2table/_pathlib.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,30 @@
11
from pathlib import Path
22

33
try:
4-
# Overshadow pathlib Path.
5-
from cloudpathlib import AnyPath as Path
4+
from cloudpathlib import AnyPath, CloudPath, S3Client
65

76
_CLOUDPATHLIB_AVAILABLE = True
7+
8+
# Set unsigned client as default for s3:// paths
9+
S3Client(no_sign_request=True).set_as_default_client()
10+
811
except ImportError:
12+
AnyPath = CloudPath = Path
13+
914
_CLOUDPATHLIB_AVAILABLE = False
1015

11-
__all__ = ["Path", "cloudpathlib_is_available"]
16+
__all__ = ["PathT", "as_path", "cloudpathlib_is_available"]
1217

18+
PathT = Path | CloudPath
1319

14-
def cloudpathlib_is_available() -> bool:
15-
"""Check if cloudpathlib is available."""
16-
return _CLOUDPATHLIB_AVAILABLE
1720

21+
def as_path(path: str | PathT) -> PathT:
22+
"""Cast input to a `Path` type."""
23+
if isinstance(path, str):
24+
return AnyPath(path)
25+
return path
1826

19-
if _CLOUDPATHLIB_AVAILABLE:
20-
# Set unsigned client as default for s3:// paths
21-
from cloudpathlib import S3Client
2227

23-
client = S3Client(no_sign_request=True)
24-
client.set_as_default_client()
28+
def cloudpathlib_is_available() -> bool:
29+
"""Check if cloudpathlib is available."""
30+
return _CLOUDPATHLIB_AVAILABLE

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ dependencies = [
3636

3737
[project.optional-dependencies]
3838
s3 = [
39-
"cloudpathlib[s3]==0.17.0",
39+
"cloudpathlib[s3]>=0.17.0",
4040
]
4141

4242
[dependency-groups]

tests/test_indexing.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import logging
12
from pathlib import Path
23

34
import pyarrow as pa
45
import pytest
6+
from pytest import LogCaptureFixture
57

68
import bids2table._indexing as indexing
79
from bids2table._pathlib import cloudpathlib_is_available
@@ -72,6 +74,22 @@ def test_index_dataset_parallel(max_workers: int):
7274
assert len(table) == expected_count
7375

7476

77+
@pytest.mark.parametrize(
78+
"path,msg",
79+
[
80+
# Not a bids dataset.
81+
("tools", "not a valid BIDS"),
82+
# Has dataset_description.json but no valid subject dirs.
83+
("ieeg_epilepsy/derivatives/brainvisa", "no matching subject"),
84+
],
85+
)
86+
def test_index_dataset_warns(path: str, msg: str, caplog: LogCaptureFixture):
87+
with caplog.at_level(logging.WARNING):
88+
tab = indexing.index_dataset(BIDS_EXAMPLES / path)
89+
assert len(tab) == 0
90+
assert msg in caplog.text
91+
92+
7593
@pytest.mark.parametrize("max_workers", [0, 2])
7694
def test_batch_index_dataset(max_workers: int):
7795
datasets = list(indexing.find_bids_datasets(BIDS_EXAMPLES))
@@ -92,6 +110,7 @@ def test_batch_index_dataset(max_workers: int):
92110
def test_get_bids_dataset(path: str, expected_name: str):
93111
name, dataset_path = indexing._get_bids_dataset(BIDS_EXAMPLES / path)
94112
assert name == expected_name
113+
assert dataset_path is not None
95114
assert indexing._contains_bids_subject_dirs(dataset_path)
96115

97116

@@ -154,6 +173,11 @@ def test_is_bids_subject_dir(path: str, expected: bool):
154173
"eeg_face13/sub-010/eeg/sub-010_coordsystem.json",
155174
True,
156175
),
176+
(
177+
# JSON data file with compound extension.
178+
"sub-0025428_ses-1_hemi-L_space-native_midthickness.surf.json",
179+
True,
180+
),
157181
(
158182
# Special case of directory that is a bids "file".
159183
"ds000247/sub-0007/ses-0001/meg/sub-0007_ses-0001_task-rest_run-01_meg.ds/",
@@ -178,3 +202,18 @@ def test_filter_include_exclude():
178202
filtered_names = indexing._filter_include(names, include)
179203
filtered_names = indexing._filter_exclude(filtered_names, exclude)
180204
assert filtered_names == expected
205+
206+
207+
@pytest.mark.parametrize(
208+
"num,expected",
209+
[
210+
(12, "12"),
211+
(1234, "1234"),
212+
(65432, "65K"),
213+
(165432, "165K"),
214+
(2165432, "2.2M"),
215+
(52165432, "52M"),
216+
],
217+
)
218+
def test_h_fmt(num: int, expected: str):
219+
assert indexing._hfmt(num) == expected

uv.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)