Merge pull request #67 from childmindresearch/enh/cloud-gs

kaitj · web-flow · commit c1d67b2fadbd · 2026-04-28T13:46:48.000-04:00
Add gs support
diff --git a/README.md b/README.md
@@ -16,16 +16,19 @@ To install the latest release from pypi, you can run
 pip install bids2table
 ```
 
-To install with S3 support, include the `s3` extra
+To install with cloud support, include the `cloud` extra
 
 ```sh
-pip install bids2table[s3]
+pip install bids2table[cloud]
 ```
 
+> [!WARNING]
+> Previous versions only supported S3. Installing with the `s3` extra is still supported, but it will be deprecated in the next version. Please update any installation scripts to use the `cloud` extra.
+
 The latest development version can be installed with
 
 ```sh
-pip install "bids2table[s3] @ git+https://github.com/childmindresearch/bids2table.git"
+pip install "bids2table[cloud] @ git+https://github.com/childmindresearch/bids2table.git"
 ```
 
 ## Usage
diff --git a/bids2table/__init__.py b/bids2table/__init__.py
@@ -1,133 +1,4 @@
-# ruff: noqa: I001
-r"""
-[![CI](https://github.com/childmindresearch/bids2table/actions/workflows/ci.yaml/badge.svg?branch=main)](https://github.com/childmindresearch/bids2table/actions/workflows/ci.yaml?query=branch%3Amain)
-[![Docs](https://github.com/childmindresearch/bids2table/actions/workflows/docs.yaml/badge.svg?branch=main)](https://childmindresearch.github.io/bids2table/bids2table)
-[![codecov](https://codecov.io/gh/childmindresearch/bids2table/branch/main/graph/badge.svg?token=22HWWFWPW5)](https://codecov.io/gh/childmindresearch/bids2table)
-[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
-![Python3](https://img.shields.io/badge/python->=3.11-blue.svg)
-[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
-
-Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud.
-
-## Installation
-
-To install the latest release from pypi, you can run
-
-```sh
-pip install bids2table
-```
-
-To install with S3 support, include the `s3` extra
-
-```sh
-pip install bids2table[s3]
-```
-
-The latest development version can be installed with
-
-```sh
-pip install "bids2table[s3] @ git+https://github.com/childmindresearch/bids2table.git"
-```
-
-## Usage
-
-To run these examples, you will need to clone the [bids-examples](https://github.com/bids-standard/bids-examples) repo.
-
-```sh
-git clone -b 1.9.0 https://github.com/bids-standard/bids-examples.git
-```
-
-### Finding BIDS datasets
-
-You can search a directory for valid BIDS datasets using `b2t2 find`
-
-```
-(bids2table) clane$ b2t2 find bids-examples | head -n 10
-bids-examples/asl002
-bids-examples/ds002
-bids-examples/ds005
-bids-examples/asl005
-bids-examples/ds051
-bids-examples/eeg_rishikesh
-bids-examples/asl004
-bids-examples/asl003
-bids-examples/ds003
-bids-examples/eeg_cbm
-```
-
-### Indexing datasets from the command line
-
-Indexing datasets is done with `b2t2 index`. Here we index a single example dataset, saving the output as a parquet file.
-
-```
-(bids2table) clane$ b2t2 index -o ds102.parquet bids-examples/ds102
-ds102: 100%|███████████████████████████████████████| 26/26 [00:00<00:00, 154.12it/s, sub=26, N=130]
-```
-
-You can also index a list of datasets. Note that each iteration in the progress bar represents one dataset.
-
-```
-(bids2table) clane$ b2t2 index -o bids-examples.parquet bids-examples/*
-100%|████████████████████████████████████████████| 87/87 [00:00<00:00, 113.59it/s, ds=None, N=9727]
-```
-
-You can pipe the output of `b2t2 find` to `b2t2 index` to create an index of all datasets under a root directory.
-
-```
-(bids2table) clane$ b2t2 find bids-examples | b2t2 index -o bids-examples.parquet
-97it [00:01, 96.05it/s, ds=ieeg_filtered_speech, N=10K]
-```
-
-The resulting index will include both top-level datasets (as in the previous command) as well nested derivatives datasets.
-
-### Indexing datasets hosted on S3
-
-bids2table supports indexing datasets hosted on S3 via [cloudpathlib](https://github.com/drivendataorg/cloudpathlib). To use this functionality, make sure to install bids2table with the `s3` extra. Or you can also just install cloudpathlib directly
-
-```sh
-pip install cloudpathlib[s3]
-```
-
-As an example, here we index all datasets on [OpenNeuro](https://openneuro.org/)
-
-```
-(bids2table) clane$ b2t2 index -o openneuro.parquet \
-  -j 8 --use-threads s3://openneuro.org/ds*
-100%|█████████████████████████████████████| 1408/1408 [12:25<00:00,  1.89it/s, ds=ds006193, N=1.2M]
-```
-
-Using 8 threads, we can index all ~1400 OpenNeuro datasets (1.2M files) in less than 15 minutes.
-
-
-### Indexing datasets from python
-
-You can also index datasets using the Python API.
-
-```python
-import bids2table as b2t2
-import pandas as pd
-import pyarrow as pa
-import pyarrow.parquet as pq
-
-# Index a single dataset.
-tab = b2t2.index_dataset("bids-examples/ds102")
-
-# Find and index a batch of datasets.
-tabs = b2t2.batch_index_dataset(
-    b2t2.find_bids_datasets("bids-examples"),
-)
-tab = pa.concat_tables(tabs)
-
-# Index a dataset on S3.
-tab = b2t2.index_dataset("s3://openneuro.org/ds000224")
-
-# Save as parquet.
-pq.write_table(tab, "ds000224.parquet")
-
-# Convert to a pandas dataframe.
-df = tab.to_pandas(types_mapper=pd.ArrowDtype)
-```
-"""
+""".. include:: ../README.md"""  # noqa: D415
 
 __all__ = [
     "index_dataset",
@@ -145,20 +16,20 @@
     "cloudpathlib_is_available",
 ]
 
+from ._entities import (
+    format_bids_path,
+    get_bids_entity_arrow_schema,
+    get_bids_schema,
+    parse_bids_entities,
+    set_bids_schema,
+    validate_bids_entities,
+)
 from ._indexing import (
-    index_dataset,
     batch_index_dataset,
     find_bids_datasets,
     get_arrow_schema,
     get_column_names,
-)
-from ._entities import (
-    parse_bids_entities,
-    validate_bids_entities,
-    set_bids_schema,
-    get_bids_schema,
-    get_bids_entity_arrow_schema,
-    format_bids_path,
+    index_dataset,
 )
 from ._metadata import load_bids_metadata
 from ._pathlib import cloudpathlib_is_available
diff --git a/bids2table/__main__.py b/bids2table/__main__.py
@@ -160,10 +160,10 @@ def _find_command(args: argparse.Namespace):
 
 
 def _check_path(path: str):
-    if path.startswith("s3://") and not b2t2.cloudpathlib_is_available():
+    if path.startswith(("s3://", "gs://")) and not b2t2.cloudpathlib_is_available():
         _logger.error(
-            "Cloudpathlib is required to use S3 paths. "
-            "Install with e.g. `pip install cloudpathlib[s3]`."
+            "Cloudpathlib is required to use cloud paths. "
+            "Install with e.g. `pip install cloudpathlib[s3,gs]`."
         )
         sys.exit(1)
 
diff --git a/bids2table/_pathlib.py b/bids2table/_pathlib.py
@@ -1,12 +1,13 @@
 from pathlib import Path
 
 try:
-    from cloudpathlib import AnyPath, CloudPath, S3Client
+    from cloudpathlib import AnyPath, CloudPath, GSClient, S3Client
 
     _CLOUDPATHLIB_AVAILABLE = True
 
-    # Set unsigned client as default for s3:// paths
+    # Set default clients for cloud paths
     S3Client(no_sign_request=True).set_as_default_client()
+    GSClient().set_as_default_client()
 
 except ImportError:
     AnyPath = CloudPath = Path
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,21 +25,22 @@ classifiers = [
     "Operating System :: Microsoft :: Windows",
 ]
 
-dependencies = ["bidsschematools>=1.0", "pyarrow>=20.0.0", "tqdm>=4.67.1"]
+dependencies = ["bidsschematools>=1.0", "pyarrow>=24.0.0", "tqdm>=4.67.3"]
 
 [project.optional-dependencies]
-s3 = ["cloudpathlib[s3]>=0.21.0"]
+cloud = ["cloudpathlib[s3,gs]>=0.21.0"]
+s3 = [
+    "cloudpathlib[s3]>=0.21.0",
+] # Keep the `s3` extra for backward compatibility
 
 [dependency-groups]
 dev = [
-    "ipython>=9.2.0",
-    "jupyter>=1.1.1",
-    "pandas==2.2.3",
-    "pdoc>=15.0.3",
-    "pre-commit>=4.1.0",
-    "pytest>=8.3.5",
-    "pytest-cov>=6.0.0",
-    "ruff>=0.11.9",
+    "pandas==3.0.2",
+    "pdoc>=16.0.0",
+    "pre-commit>=4.6.0",
+    "pytest>=9.0.3",
+    "pytest-cov>=7.1.0",
+    "ruff>=0.15.12",
 ]
 
 [project.urls]
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
@@ -15,7 +15,7 @@
 def test_get_arrow_schema():
     schema = indexing.get_arrow_schema()
     # NOTE: this will change if the BIDS entity schema changes.
-    assert len(schema) == 38
+    assert len(schema) == 42
 
 
 def test_get_column_names():
diff --git a/uv.lock b/uv.lock