Downgrade minimal required python to 3.11 (#51)

clane9 · kaitj · web-flow · commit 28ee47e8687f · 2025-05-28T11:45:54.000-04:00
* Downgrade minimal required python to 3.11

Requiring python>=3.12 is a bit of a burden. It's only required for
`find_bids_datasets`, which uses `Path.walk`. Instead, downgrade the minimum
python to 3.11 and add a guard on this function in case python<3.12.

Note, we could look into downgrading further. The next blocker would be
that `get_column_names` returns a `StrEnum`, which was introduced in 3.11.
I'm hesitant to remove this though, because being able to treat these
enum fields as native strings is nice.

* Switch path.walk to os.walk for py311

- Enables use of `find_bids_datasets` in py311.
  - `root` in `_indexing.py` is initially passed as its original type, with each walked `dirpath` cast to `Path`
- Removed the error for <py312 throughout the codebase and tests

* Setup python matrix for testing

- Only runs after formatting
- Update dependencies to support other Python versions

* Implement iterative directory walk for find

`Path.walk` and `CloudPath.walk` depend on python>=3.12. Also,
`CloudPath.walk` retrieves all files up front rather than iteratively.
Here we add some directory walk logic of our own for iteratively finding
BIDS datasets under a root directory.

* Update README.md

* Update module docs

---------

Co-authored-by: Jason Kai <21226986+kaitj@users.noreply.github.com>
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -2,20 +2,20 @@ name: CI
 
 on:
   push:
-    branches: [ "main" ]
+    branches: ["main"]
   pull_request:
-    branches: [ "main" ]
+    branches: ["main"]
 
 env:
   UV_FROZEN: true
 
 jobs:
-  test:
+  format:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
         with:
-          submodules: 'true'
+          submodules: "true"
       - name: Install uv
         uses: astral-sh/setup-uv@v5
         with:
@@ -26,9 +26,24 @@ jobs:
         run: |
           uv run ruff check bids2table tests
           uv run ruff format --check bids2table tests
+
+  tests:
+    runs-on: ubuntu-latest
+    needs: format
+    strategy:
+      matrix:
+        python-version: ["3.11", "3.12", "3.13"]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Install uv with python version
+        uses: astral-sh/setup-uv@v6
+        with:
+          python-version: ${{ matrix.python-version }}
       - name: Run tests
         run: |
-          uv run pytest \
+          uv run --all-extras pytest \
             --junitxml=pytest.xml \
             --cov-report=xml:coverage.xml \
             --cov=bids2table tests
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Docs](https://github.com/childmindresearch/bids2table/actions/workflows/docs.yaml/badge.svg?branch=main)](https://childmindresearch.github.io/bids2table/bids2table)
 [![codecov](https://codecov.io/gh/childmindresearch/bids2table/branch/main/graph/badge.svg?token=22HWWFWPW5)](https://codecov.io/gh/childmindresearch/bids2table)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
-![Python3](https://img.shields.io/badge/python->=3.12-blue.svg)
+![Python3](https://img.shields.io/badge/python->=3.11-blue.svg)
 [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
 
 Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud.
diff --git a/bids2table/__init__.py b/bids2table/__init__.py
@@ -4,7 +4,7 @@
 [![Docs](https://github.com/childmindresearch/bids2table/actions/workflows/docs.yaml/badge.svg?branch=main)](https://childmindresearch.github.io/bids2table/bids2table)
 [![codecov](https://codecov.io/gh/childmindresearch/bids2table/branch/main/graph/badge.svg?token=22HWWFWPW5)](https://codecov.io/gh/childmindresearch/bids2table)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
-![Python3](https://img.shields.io/badge/python->=3.12-blue.svg)
+![Python3](https://img.shields.io/badge/python->=3.11-blue.svg)
 [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
 
 Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud.
diff --git a/bids2table/__main__.py b/bids2table/__main__.py
@@ -67,6 +67,9 @@ def main():
     parser_index.set_defaults(func=_index_command)
 
     parser_find = subparsers.add_parser("find", help="Find BIDS datasets.")
+    parser_find.add_argument(
+        "--maxdepth", type=int, help="Max search depth", default=None
+    )
     parser_find.add_argument(
         "--exclude-dirs",
         metavar="DIR",
@@ -75,12 +78,6 @@ def main():
         default=None,
         help="List of directory names or glob patterns to exclude from search.",
     )
-    parser_find.add_argument(
-        "--follow-symlinks",
-        "-L",
-        action="store_true",
-        help="Follow symbolic links.",
-    )
     parser_find.add_argument(
         "--verbose",
         "-v",
@@ -157,7 +154,7 @@ def _find_command(args: argparse.Namespace):
     for dataset in b2t2.find_bids_datasets(
         args.root,
         exclude=args.exclude_dirs,
-        follow_symlinks=args.follow_symlinks,
+        maxdepth=args.maxdepth,
     ):
         print(dataset)
 
diff --git a/bids2table/_indexing.py b/bids2table/_indexing.py
@@ -129,51 +129,66 @@ def get_column_names() -> enum.StrEnum:
 def find_bids_datasets(
     root: str | PathT,
     exclude: str | list[str] | None = None,
-    follow_symlinks: bool = True,
-    log_frequency: int = 100,
+    maxdepth: int | None = None,
 ) -> Generator[PathT, None, None]:
     """Find all BIDS datasets under a root directory.
 
     Args:
         root: Root path to begin search.
         exclude: Glob pattern or list of patterns matching sub-directory names to
             exclude from the search.
-        follow_symlinks: Search into symlinks that point to directories.
+        maxdepth: Maximum depth to search.
 
     Yields:
         Root paths of all BIDS datasets under `root`.
     """
     root = as_path(root)
 
-    dir_count = 0
+    if isinstance(exclude, str):
+        exclude = [exclude]
+    elif exclude is None:
+        exclude = []
+    exclude = [re.compile(fnmatch.translate(pat)) for pat in exclude]
+
+    entry_count = 1
     ds_count = 0
 
-    # NOTE: Path.walk was introduced in 3.12. Otherwise, could use an older python.
-    for dirpath, dirnames, _ in root.walk(follow_symlinks=follow_symlinks):
-        dir_count += 1
+    if _is_bids_dataset(root):
+        ds_count += 1
+        yield root
 
-        if _is_bids_dataset(dirpath):
-            ds_count += 1
-            yield dirpath
+    # Tuple of path, depth
+    stack = [(root, 0)]
 
-            # Only descend into specific sub-directories that are allowed to contain
-            # sub-datasets.
-            _filter_dirnames(dirnames, _BIDS_NESTED_PARENT_DIRNAMES)
+    while stack:
+        top, depth = stack.pop()
 
-        # Filter sub-directories to descend into.
-        if exclude:
-            matches = _filter_exclude(dirnames, exclude)
-            _filter_dirnames(dirnames, matches)
+        inside_bids = _is_bids_dataset(top)
+        depth += 1
 
-        if log_frequency and dir_count % log_frequency == 0:
-            _logger.info(
-                "Searched %d directories; found %d BIDS datasets.", dir_count, ds_count
-            )
+        for entry in top.iterdir():
+            entry_count += 1
 
-    if log_frequency:
-        _logger.info(
-            "Searched %d directories; found %d BIDS datasets.", dir_count, ds_count
-        )
+            if any(re.fullmatch(pat, entry.name) for pat in exclude):
+                continue
+
+            if _is_bids_dataset(entry):
+                ds_count += 1
+                yield entry
+
+            # Checks if we should descend into this directory.
+            # Check not reached final depth.
+            descend = maxdepth is None or depth < maxdepth
+            # Heuristic checks whether the filename looks like a (visible) directory.
+            descend = descend and not (entry.suffix or entry.name.startswith("."))
+            # Only descend into specific subdirectories of BIDS directories.
+            descend = descend and (
+                not inside_bids or entry.name in _BIDS_NESTED_PARENT_DIRNAMES
+            )
+            # Finally, check if actually a directory (which is slow so we want to
+            # short-circuit as much as possible).
+            if descend and entry.is_dir():
+                stack.append((entry, depth))
 
 
 def index_dataset(
@@ -316,6 +331,17 @@ def _get_bids_dataset(path: str | PathT) -> tuple[str | None, PathT | None]:
 
 def _is_bids_dataset(path: PathT) -> bool:
     """Test if path is a BIDS dataset root directory."""
+    # Quick heuristic checks.
+    # BIDS datasets should not contain a file extension.
+    if path.suffix:
+        return False
+    # Path should not be hidden.
+    if path.name.startswith("."):
+        return False
+    # Subject dirs are not datasets.
+    if _is_bids_subject_dir(path):
+        return False
+
     # Check if contains a dataset_description.json or any subject directories. Note,
     # it's common for ppl to forget the dataset description, so let's not be too strict.
     description_exists = (path / "dataset_description.json").exists()
@@ -493,15 +519,6 @@ def _multi_pattern_filter(names: list[str], patterns: str | list[str]) -> set[st
     return matching_names
 
 
-def _filter_dirnames(dirnames: list[str], matches: set[str]) -> None:
-    """Remove dirnames matching `matches` in place."""
-    # Iterate in reversed order since we are modifying in place.
-    n_names = len(dirnames)
-    for ii, dirname in enumerate(reversed(dirnames)):
-        if dirname not in matches:
-            del dirnames[n_names - ii - 1]
-
-
 def _hfmt(n: int) -> str:
     if n < 10_000:
         n_fmt = str(n)
diff --git a/pyproject.toml b/pyproject.toml
@@ -5,13 +5,11 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "bids2table"
 dynamic = ["version"]
-authors = [
-    { name = "Connor Lane", email = "connor.lane858@gmail.com" },
-]
+authors = [{ name = "Connor Lane", email = "connor.lane858@gmail.com" }]
 description = "Index BIDS datasets fast, locally or in the cloud."
 readme = "README.md"
-requires-python = ">=3.12"
-license = {text = "MIT License"}
+requires-python = ">=3.11"
+license = { text = "MIT License" }
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Developers",
@@ -20,24 +18,17 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
-    "Programming Language :: Python :: 3.14",
     "License :: OSI Approved :: MIT License",
     "Operating System :: POSIX",
     "Operating System :: Unix",
     "Operating System :: MacOS",
     "Operating System :: Microsoft :: Windows",
 ]
 
-dependencies = [
-    "bidsschematools>=1.0",
-    "pyarrow>=14.0.2",
-    "tqdm>=4.66.2",
-]
+dependencies = ["bidsschematools>=1.0", "pyarrow>=20.0.0", "tqdm>=4.67.1"]
 
 [project.optional-dependencies]
-s3 = [
-    "cloudpathlib[s3]>=0.17.0",
-]
+s3 = ["cloudpathlib[s3]>=0.21.0"]
 
 [dependency-groups]
 dev = [
@@ -48,7 +39,7 @@ dev = [
     "pre-commit>=4.1.0",
     "pytest>=8.3.5",
     "pytest-cov>=6.0.0",
-    "ruff>=0.9.10",
+    "ruff>=0.11.9",
 ]
 
 [project.urls]
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
@@ -1,4 +1,5 @@
 import logging
+from itertools import islice
 from pathlib import Path
 
 import pyarrow as pa
@@ -25,24 +26,43 @@ def test_get_column_names():
 
 
 def test_find_bids_datasets():
-    datasets = sorted(indexing.find_bids_datasets(BIDS_EXAMPLES, log_frequency=100))
+    datasets = sorted(
+        indexing.find_bids_datasets(
+            BIDS_EXAMPLES,
+            exclude=["surfaces", "subjects", "code", "sourcedata"],
+        )
+    )
     expected_datasets = sorted(
         [p.parent for p in BIDS_EXAMPLES.rglob("dataset_description.json")]
     )
     # find_bids_datasets finds a few extra derivative datasets that are missing a
     # dataset_description.json.
     assert set(expected_datasets).issubset(datasets)
-    assert len(datasets) == len(expected_datasets) + 6
+    assert len(datasets) == len(expected_datasets) + 3
 
     datasets_no_derivatives = sorted(
-        indexing.find_bids_datasets(BIDS_EXAMPLES, exclude="derivatives")
+        indexing.find_bids_datasets(
+            BIDS_EXAMPLES,
+            exclude=["derivatives", "code", "sourcedata"],
+        )
     )
     expected_datasets_no_derivatives = sorted(
         [p.parent for p in BIDS_EXAMPLES.glob("*/dataset_description.json")]
     )
     assert datasets_no_derivatives == expected_datasets_no_derivatives
 
 
+def test_find_bids_datasets_s3():
+    root = "s3://openneuro.org"
+    datasets = list(islice(indexing.find_bids_datasets(root, maxdepth=2), 10))
+    names = sorted([ds.name for ds in datasets])
+    expected_names = [
+        "ds000001", "ds000002", "ds000003", "ds000005", "ds000006",
+        "ds000007", "ds000008", "ds000009", "ds000011", "ds000017",
+    ]  # fmt: skip
+    assert names == expected_names
+
+
 @pytest.mark.parametrize(
     "root,expected_count",
     [
@@ -92,12 +112,12 @@ def test_index_dataset_warns(path: str, msg: str, caplog: LogCaptureFixture):
 
 @pytest.mark.parametrize("max_workers", [0, 2])
 def test_batch_index_dataset(max_workers: int):
-    datasets = list(indexing.find_bids_datasets(BIDS_EXAMPLES))
+    datasets = list(BIDS_EXAMPLES.glob("*"))
     tables = indexing.batch_index_dataset(
         datasets, max_workers=max_workers, show_progress=False
     )
     table = pa.concat_tables(tables)
-    assert len(table) == 10133
+    assert len(table) == 9727
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -10,13 +10,6 @@
 
 BIDS_EXAMPLES = Path(__file__).parents[1] / "bids-examples"
 
-COMMANDS = [
-    "find {examples}",
-    "index -o {out_dir}/ds102.parquet {examples}/ds102",
-    "index -o {out_dir}/ds101_ds102.parquet {examples}/ds101 {examples}/ds102",
-    "index -o {out_dir}/ds10N.parquet '{examples}/ds10?'",
-]
-
 
 @contextmanager
 def patch_argv(argv: List[str]):
@@ -31,7 +24,6 @@ def patch_argv(argv: List[str]):
 @pytest.mark.parametrize(
     "cmd,output",
     [
-        ("find {examples}", None),
         ("index -o {out_dir}/ds102.parquet {examples}/ds102", "ds102.parquet"),
         (
             "index -o {out_dir}/ds101_ds102.parquet {examples}/ds101 {examples}/ds102",
@@ -40,7 +32,7 @@ def patch_argv(argv: List[str]):
         ("index -o {out_dir}/ds10N.parquet '{examples}/ds10?'", "ds10N.parquet"),
     ],
 )
-def test_main(cmd: str, output: str | None, tmp_path: Path):
+def test_main_index(cmd: str, output: str | None, tmp_path: Path):
     cmd_fmt = cmd.format(out_dir=tmp_path, examples=BIDS_EXAMPLES)
     prog = str(Path(cli.__file__).absolute())
     argv = [prog] + shlex.split(cmd_fmt)
@@ -49,3 +41,12 @@ def test_main(cmd: str, output: str | None, tmp_path: Path):
 
     if output:
         assert (tmp_path / output).exists()
+
+
+@pytest.mark.parametrize("cmd", ["find {examples}"])
+def test_main_find(cmd: str):
+    cmd_fmt = cmd.format(examples=BIDS_EXAMPLES)
+    prog = str(Path(cli.__file__).absolute())
+    argv = [prog] + shlex.split(cmd_fmt)
+    with patch_argv(argv):
+        cli.main()
diff --git a/uv.lock b/uv.lock