Pre-release fixes (#50)

clane9 · web-flow · commit c605559f2633 · 2025-05-08T16:15:08.000-04:00
* Minor cleanup

* Add compound json test case for `_is_bids_file`

* Add test case for indexing invalid/empty datasets

* Remove commented-out line and add an explanatory comment

* Add test for `_hfmt`

N.B.: this test is arguably excessive.

* Typing fixes

* Update pyproject.toml

* Minor update
diff --git a/bids2table/__init__.py b/bids2table/__init__.py
@@ -159,5 +159,5 @@
     get_bids_entity_arrow_schema,
     format_bids_path,
 )
-from ._pathlib import Path, cloudpathlib_is_available
+from ._pathlib import cloudpathlib_is_available
 from ._version import *
diff --git a/bids2table/__main__.py b/bids2table/__main__.py
@@ -1,13 +1,13 @@
 import argparse
 import concurrent.futures
-import re
+import glob
 import sys
 
 import pyarrow.parquet as pq
 
 import bids2table as b2t2
-from bids2table import Path
 from bids2table._logging import setup_logger
+from bids2table._pathlib import as_path
 
 _logger = setup_logger(__package__)
 
@@ -116,8 +116,8 @@ def _index_command(args: argparse.Namespace):
 
     root = []
     for path in args.root:
-        if _is_glob(path):
-            path = Path(path)
+        if glob.has_magic(path):
+            path = as_path(path)
             paths = list(path.parent.glob(path.name))
             root.extend(paths)
         else:
@@ -171,9 +171,5 @@ def _check_path(path: str):
         sys.exit(1)
 
 
-def _is_glob(path: str) -> bool:
-    return bool(re.search(r"[*?\[\]]", path))
-
-
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/bids2table/_entities.py b/bids2table/_entities.py
@@ -157,11 +157,8 @@ def parse_bids_entities(path: str | Path) -> dict[str, str]:
 
     # Get suffix and extension.
     suffix_ext = parts.pop()
-    idx = suffix_ext.find(".")
-    if idx < 0:
-        suffix, ext = suffix_ext, None
-    else:
-        suffix, ext = suffix_ext[:idx], suffix_ext[idx:]
+    suffix, dot, ext = suffix_ext.partition(".")
+    ext = dot + ext if ext else None
 
     # Suffix is actually an entity, put back in list.
     if "-" in suffix:
diff --git a/bids2table/_indexing.py b/bids2table/_indexing.py
@@ -21,7 +21,7 @@
     validate_bids_entities,
 )
 from ._logging import setup_logger
-from ._pathlib import Path
+from ._pathlib import PathT, as_path
 
 _BIDS_SUBJECT_DIR_PATTERN = re.compile(r"sub-[a-zA-Z0-9]+")
 
@@ -109,7 +109,7 @@ def get_arrow_schema() -> pa.Schema:
     return schema
 
 
-def get_column_names() -> enum.EnumType:
+def get_column_names() -> enum.StrEnum:
     """Get an enum of the BIDS index columns."""
     # TODO: It might be nice if the column names were statically available. One option
     # would be to generate a static _schema.py module at install time (similar to how
@@ -127,11 +127,11 @@ def get_column_names() -> enum.EnumType:
 
 
 def find_bids_datasets(
-    root: str | Path,
+    root: str | PathT,
     exclude: str | list[str] | None = None,
     follow_symlinks: bool = True,
     log_frequency: int = 100,
-) -> Generator[Path, None, None]:
+) -> Generator[PathT, None, None]:
     """Find all BIDS datasets under a root directory.
 
     Args:
@@ -143,8 +143,7 @@ def find_bids_datasets(
     Yields:
         Root paths of all BIDS datasets under `root`.
     """
-    if isinstance(root, str):
-        root = Path(root)
+    root = as_path(root)
 
     dir_count = 0
     ds_count = 0
@@ -178,7 +177,7 @@ def find_bids_datasets(
 
 
 def index_dataset(
-    root: str | Path,
+    root: str | PathT,
     include_subjects: str | list[str] | None = None,
     max_workers: int | None = 0,
     chunksize: int = 32,
@@ -203,8 +202,7 @@ def index_dataset(
     Returns:
         An Arrow table index of the BIDS dataset.
     """
-    if isinstance(root, str):
-        root = Path(root)
+    root = as_path(root)
 
     schema = get_arrow_schema()
 
@@ -243,7 +241,7 @@ def index_dataset(
 
 
 def batch_index_dataset(
-    roots: list[str | Path],
+    roots: list[str | PathT],
     max_workers: int | None = 0,
     executor_cls: type[Executor] = ProcessPoolExecutor,
     show_progress: bool = False,
@@ -275,13 +273,13 @@ def batch_index_dataset(
         yield table
 
 
-def _batch_index_func(root: str | Path) -> tuple[str, pa.Table]:
+def _batch_index_func(root: str | PathT) -> tuple[str | None, pa.Table]:
     dataset, _ = _get_bids_dataset(root)
     table = index_dataset(root, max_workers=0, show_progress=False)
     return dataset, table
 
 
-def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
+def _get_bids_dataset(path: str | PathT) -> tuple[str | None, PathT | None]:
     """Get the BIDS dataset that the path belongs to, if any.
 
     Return the dataset directory name and the full dataset path. For nested derivatives
@@ -290,13 +288,10 @@ def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
 
     Note that the name is extracted from the path, not the dataset description JSON.
     """
-    if isinstance(path, str):
-        path = Path(path)
-
-    parent = path
+    parent = as_path(path)
     parts: list[str] = []
     scanning = False
-    top_idx = None
+    top_idx = 0
     root = None
 
     while parent.name:
@@ -319,24 +314,24 @@ def _get_bids_dataset(path: str | Path) -> tuple[str | None, Path | None]:
     return dataset, root
 
 
-def _is_bids_dataset(path: Path) -> bool:
+def _is_bids_dataset(path: PathT) -> bool:
     """Test if path is a BIDS dataset root directory."""
     # Check if contains a dataset_description.json or any subject directories. Note,
     # it's common for ppl to forget the dataset description, so let's not be too strict.
     description_exists = (path / "dataset_description.json").exists()
     return description_exists or _contains_bids_subject_dirs(path)
 
 
-def _contains_bids_subject_dirs(root: Path) -> bool:
+def _contains_bids_subject_dirs(root: PathT) -> bool:
     """Check if a path contains one or more BIDS subject dirs."""
     # Nb, this will return on the first matching path thanks to the generator.
     return any(_is_bids_subject_dir(path) for path in root.glob("sub-*"))
 
 
 def _find_bids_subject_dirs(
-    root: Path,
+    root: PathT,
     include_subjects: str | list[str] | None = None,
-) -> list[Path]:
+) -> list[PathT]:
     """Find all BIDS subject dirs contained in a root directory.
 
     Note, only looks one level down. Does not find nested subject directories, e.g. in
@@ -352,7 +347,7 @@ def _find_bids_subject_dirs(
     return paths
 
 
-def _is_bids_subject_dir(path: Path) -> bool:
+def _is_bids_subject_dir(path: PathT) -> bool:
     """Check if a path is a BIDS subject directory."""
     # NOTE: not checking if the path is in fact a directory.
     # This is a slow op, especially on cloud. Can assume that there are no files
@@ -362,7 +357,7 @@ def _is_bids_subject_dir(path: Path) -> bool:
 
 
 def _index_bids_subject_dir(
-    path: Path,
+    path: PathT,
     schema: pa.Schema | None = None,
     dataset: str | None = None,
 ) -> tuple[str, pa.Table]:
@@ -394,7 +389,7 @@ def _index_bids_subject_dir(
     return subject, table
 
 
-def _is_bids_file(path: Path) -> bool:
+def _is_bids_file(path: PathT) -> bool:
     """Check if file is a BIDS file.
 
     Not very exact, but hopefully good enough.
@@ -407,7 +402,8 @@ def _is_bids_file(path: Path) -> bool:
         return False
 
     entities = _cache_parse_bids_entities(path)
-    # if not (entities.get("suffix") and entities.get("datatype")):
+    # If we want to exclude metadata files like *_scans.tsv, we can also check for
+    # datatype.
     if not (entities.get("suffix") and entities.get("ext")):
         return False
 
@@ -422,7 +418,7 @@ def _is_bids_file(path: Path) -> bool:
     return True
 
 
-def _is_bids_json_sidecar(path: Path) -> bool:
+def _is_bids_json_sidecar(path: PathT) -> bool:
     """Quick check if a file is a JSON sidecar."""
     # Quick check if path suffix is not json.
     if path.suffix != ".json":
diff --git a/bids2table/_pathlib.py b/bids2table/_pathlib.py
@@ -1,24 +1,30 @@
 from pathlib import Path
 
 try:
-    # Overshadow pathlib Path.
-    from cloudpathlib import AnyPath as Path
+    from cloudpathlib import AnyPath, CloudPath, S3Client
 
     _CLOUDPATHLIB_AVAILABLE = True
+
+    # Set unsigned client as default for s3:// paths
+    S3Client(no_sign_request=True).set_as_default_client()
+
 except ImportError:
+    AnyPath = CloudPath = Path
+
     _CLOUDPATHLIB_AVAILABLE = False
 
-__all__ = ["Path", "cloudpathlib_is_available"]
+__all__ = ["PathT", "as_path", "cloudpathlib_is_available"]
 
+PathT = Path | CloudPath
 
-def cloudpathlib_is_available() -> bool:
-    """Check if cloudpathlib is available."""
-    return _CLOUDPATHLIB_AVAILABLE
 
+def as_path(path: str | PathT) -> PathT:
+    """Cast input to a `Path` type."""
+    if isinstance(path, str):
+        return AnyPath(path)
+    return path
 
-if _CLOUDPATHLIB_AVAILABLE:
-    # Set unsigned client as default for s3:// paths
-    from cloudpathlib import S3Client
 
-    client = S3Client(no_sign_request=True)
-    client.set_as_default_client()
+def cloudpathlib_is_available() -> bool:
+    """Check if cloudpathlib is available."""
+    return _CLOUDPATHLIB_AVAILABLE
diff --git a/pyproject.toml b/pyproject.toml
@@ -36,7 +36,7 @@ dependencies = [
 
 [project.optional-dependencies]
 s3 = [
-    "cloudpathlib[s3]==0.17.0",
+    "cloudpathlib[s3]>=0.17.0",
 ]
 
 [dependency-groups]
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
@@ -1,7 +1,9 @@
+import logging
 from pathlib import Path
 
 import pyarrow as pa
 import pytest
+from pytest import LogCaptureFixture
 
 import bids2table._indexing as indexing
 from bids2table._pathlib import cloudpathlib_is_available
@@ -72,6 +74,22 @@ def test_index_dataset_parallel(max_workers: int):
     assert len(table) == expected_count
 
 
+@pytest.mark.parametrize(
+    "path,msg",
+    [
+        # Not a bids dataset.
+        ("tools", "not a valid BIDS"),
+        # Has dataset_description.json but no valid subject dirs.
+        ("ieeg_epilepsy/derivatives/brainvisa", "no matching subject"),
+    ],
+)
+def test_index_dataset_warns(path: str, msg: str, caplog: LogCaptureFixture):
+    with caplog.at_level(logging.WARNING):
+        tab = indexing.index_dataset(BIDS_EXAMPLES / path)
+    assert len(tab) == 0
+    assert msg in caplog.text
+
+
 @pytest.mark.parametrize("max_workers", [0, 2])
 def test_batch_index_dataset(max_workers: int):
     datasets = list(indexing.find_bids_datasets(BIDS_EXAMPLES))
@@ -92,6 +110,7 @@ def test_batch_index_dataset(max_workers: int):
 def test_get_bids_dataset(path: str, expected_name: str):
     name, dataset_path = indexing._get_bids_dataset(BIDS_EXAMPLES / path)
     assert name == expected_name
+    assert dataset_path is not None
     assert indexing._contains_bids_subject_dirs(dataset_path)
 
 
@@ -154,6 +173,11 @@ def test_is_bids_subject_dir(path: str, expected: bool):
             "eeg_face13/sub-010/eeg/sub-010_coordsystem.json",
             True,
         ),
+        (
+            # JSON data file with compound extension.
+            "sub-0025428_ses-1_hemi-L_space-native_midthickness.surf.json",
+            True,
+        ),
         (
             # Special case of directory that is a bids "file".
             "ds000247/sub-0007/ses-0001/meg/sub-0007_ses-0001_task-rest_run-01_meg.ds/",
@@ -178,3 +202,18 @@ def test_filter_include_exclude():
     filtered_names = indexing._filter_include(names, include)
     filtered_names = indexing._filter_exclude(filtered_names, exclude)
     assert filtered_names == expected
+
+
+@pytest.mark.parametrize(
+    "num,expected",
+    [
+        (12, "12"),
+        (1234, "1234"),
+        (65432, "65K"),
+        (165432, "165K"),
+        (2165432, "2.2M"),
+        (52165432, "52M"),
+    ],
+)
+def test_h_fmt(num: int, expected: str):
+    assert indexing._hfmt(num) == expected
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -159,5 +159,5 @@`
`159`	`159`	`get_bids_entity_arrow_schema,`
`160`	`160`	`format_bids_path,`
`161`	`161`	`)`
`162`		`-from ._pathlib import Path, cloudpathlib_is_available`
	`162`	`+from ._pathlib import cloudpathlib_is_available`
`163`	`163`	`from ._version import *`
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ dependencies = [`
`36`	`36`
`37`	`37`	`[project.optional-dependencies]`
`38`	`38`	`s3 = [`
`39`		`- "cloudpathlib[s3]==0.17.0",`
	`39`	`+ "cloudpathlib[s3]>=0.17.0",`
`40`	`40`	`]`
`41`	`41`
`42`	`42`	`[dependency-groups]`