childmindresearch · clane9 · May 6, 2025 · May 6, 2025 · May 6, 2025 · May 6, 2025
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -0,0 +1,46 @@
+name: docs
+
+on:
+  workflow_run:
+    workflows: [CI]
+    types:
+    - completed
+    branches:
+    - main
+
+env:
+  UV_FROZEN: true
+
+jobs:
+  # Build the docs (pdoc is invoked via `uv run` because the uv venv is not on PATH) and upload the HTML as an artifact.
+  build:
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          pyproject-file: pyproject.toml
+      - name: Install the project
+        run: uv sync --all-extras
+      - name: Build docs
+        run: uv run pdoc -o docs/ -d google bids2table
+      - uses: actions/upload-pages-artifact@v3
+        with:
+          path: docs/
+
+  # Deploy the artifact to GitHub pages.
+  # This is a separate job so that only actions/deploy-pages has the necessary permissions.
+  deploy:
+    needs: build
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ __pycache__/
 *.egg-info/
 dist
 build
+docs
 */_version.py
 
 # Unit test / coverage reports

diff --git a/README.md b/README.md
@@ -1,11 +1,12 @@
 # bids2table
 [![CI](https://github.com/childmindresearch/bids2table/actions/workflows/ci.yaml/badge.svg?branch=main)](https://github.com/childmindresearch/bids2table/actions/workflows/ci.yaml?query=branch%3Amain)
+[![Docs](https://github.com/childmindresearch/bids2table/actions/workflows/docs.yaml/badge.svg?branch=main)](https://childmindresearch.github.io/bids2table/bids2table)
 [![codecov](https://codecov.io/gh/childmindresearch/bids2table/branch/main/graph/badge.svg?token=22HWWFWPW5)](https://codecov.io/gh/childmindresearch/bids2table)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 ![Python3](https://img.shields.io/badge/python->=3.12-blue.svg)
 [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
 
-Index BIDS datasets fast, locally or in the cloud.
+Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud.
 
 ## Installation
 
@@ -102,8 +103,10 @@ Using 8 threads, we can index all ~1400 OpenNeuro datasets (1.2M files) in less
 You can also index datasets using the Python API.
 
 ```python
-import pyarrow as pa
 import bids2table as b2t2
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 
 # Index a single dataset.
 tab = b2t2.index_dataset("bids-examples/ds102")
@@ -116,4 +119,10 @@ tab = pa.concat_tables(tabs)
 
 # Index a dataset on S3.
 tab = b2t2.index_dataset("s3://openneuro.org/ds000224")
+
+# Save as parquet.
+pq.write_table(tab, "ds000224.parquet")
+
+# Convert to a pandas dataframe.
+df = tab.to_pandas(types_mapper=pd.ArrowDtype)
 ```
diff --git a/bids2table/__init__.py b/bids2table/__init__.py
@@ -1,6 +1,156 @@
 # ruff: noqa: I001
-"""Index BIDS datasets fast, locally or in the cloud."""
+r"""
+[![CI](https://github.com/childmindresearch/bids2table/actions/workflows/ci.yaml/badge.svg?branch=main)](https://github.com/childmindresearch/bids2table/actions/workflows/ci.yaml?query=branch%3Amain)
+[![Docs](https://github.com/childmindresearch/bids2table/actions/workflows/docs.yaml/badge.svg?branch=main)](https://childmindresearch.github.io/bids2table/bids2table)
+[![codecov](https://codecov.io/gh/childmindresearch/bids2table/branch/main/graph/badge.svg?token=22HWWFWPW5)](https://codecov.io/gh/childmindresearch/bids2table)
+[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+![Python3](https://img.shields.io/badge/python->=3.12-blue.svg)
+[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/childmindresearch/bids2table/blob/main/LICENSE)
 
+Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud.
+
+## Installation
+
+To install the latest release from pypi, you can run
+
+```sh
+pip install bids2table
+```
+
+To install with S3 support, include the `s3` extra
+
+```sh
+pip install bids2table[s3]
+```
+
+The latest development version can be installed with
+
+```sh
+pip install "bids2table[s3] @ git+https://github.com/childmindresearch/bids2table.git"
+```
+
+## Usage
+
+To run these examples, you will need to clone the [bids-examples](https://github.com/bids-standard/bids-examples) repo.
+
+```sh
+git clone -b 1.9.0 https://github.com/bids-standard/bids-examples.git
+```
+
+### Finding BIDS datasets
+
+You can search a directory for valid BIDS datasets using `b2t2 find`
+
+```
+(bids2table) clane$ b2t2 find bids-examples | head -n 10
+bids-examples/asl002
+bids-examples/ds002
+bids-examples/ds005
+bids-examples/asl005
+bids-examples/ds051
+bids-examples/eeg_rishikesh
+bids-examples/asl004
+bids-examples/asl003
+bids-examples/ds003
+bids-examples/eeg_cbm
+```
+
+### Indexing datasets from the command line
+
+Indexing datasets is done with `b2t2 index`. Here we index a single example dataset, saving the output as a parquet file.
+
+```
+(bids2table) clane$ b2t2 index -o ds102.parquet bids-examples/ds102
+ds102: 100%|███████████████████████████████████████| 26/26 [00:00<00:00, 154.12it/s, sub=26, N=130]
+```
+
+You can also index a list of datasets. Note that each iteration in the progress bar represents one dataset.
+
+```
+(bids2table) clane$ b2t2 index -o bids-examples.parquet bids-examples/*
+100%|████████████████████████████████████████████| 87/87 [00:00<00:00, 113.59it/s, ds=None, N=9727]
+```
+
+You can pipe the output of `b2t2 find` to `b2t2 index` to create an index of all datasets under a root directory.
+
+```
+(bids2table) clane$ b2t2 find bids-examples | b2t2 index -o bids-examples.parquet
+97it [00:01, 96.05it/s, ds=ieeg_filtered_speech, N=10K]
+```
+
+The resulting index will include both top-level datasets (as in the previous command) as well as nested derivative datasets.
+
+### Indexing datasets hosted on S3
+
+bids2table supports indexing datasets hosted on S3 via [cloudpathlib](https://github.com/drivendataorg/cloudpathlib). To use this functionality, make sure to install bids2table with the `s3` extra. Alternatively, you can install cloudpathlib directly
+
+```sh
+pip install cloudpathlib[s3]
+```
+
+As an example, here we index all datasets on [OpenNeuro](https://openneuro.org/)
+
+```
+(bids2table) clane$ b2t2 index -o openneuro.parquet \
+  -j 8 --use-threads s3://openneuro.org/ds*
+100%|█████████████████████████████████████| 1408/1408 [12:25<00:00,  1.89it/s, ds=ds006193, N=1.2M]
+```
+
+Using 8 threads, we can index all ~1400 OpenNeuro datasets (1.2M files) in less than 15 minutes.
+
+
+### Indexing datasets from python
+
+You can also index datasets using the Python API.
+
+```python
+import bids2table as b2t2
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+# Index a single dataset.
+tab = b2t2.index_dataset("bids-examples/ds102")
+
+# Find and index a batch of datasets.
+tabs = b2t2.batch_index_dataset(
+    b2t2.find_bids_datasets("bids-examples"),
+)
+tab = pa.concat_tables(tabs)
+
+# Index a dataset on S3.
+tab = b2t2.index_dataset("s3://openneuro.org/ds000224")
+
+# Save as parquet.
+pq.write_table(tab, "ds000224.parquet")
+
+# Convert to a pandas dataframe.
+df = tab.to_pandas(types_mapper=pd.ArrowDtype)
+```
+"""
+
+__all__ = [
+    "index_dataset",
+    "batch_index_dataset",
+    "find_bids_datasets",
+    "get_arrow_schema",
+    "get_column_names",
+    "parse_bids_entities",
+    "validate_bids_entities",
+    "set_bids_schema",
+    "get_bids_schema",
+    "get_bids_entity_arrow_schema",
+    "format_bids_path",
+    "cloudpathlib_is_available",
+]
+
+from ._indexing import (
+    index_dataset,
+    batch_index_dataset,
+    find_bids_datasets,
+    get_arrow_schema,
+    get_column_names,
+)
 from ._entities import (
     parse_bids_entities,
     validate_bids_entities,
@@ -9,12 +159,5 @@
     get_bids_entity_arrow_schema,
     format_bids_path,
 )
-from ._indexing import (
-    find_bids_datasets,
-    index_dataset,
-    batch_index_dataset,
-    get_arrow_schema,
-    get_column_names,
-)
 from ._pathlib import Path, cloudpathlib_is_available
 from ._version import *
diff --git a/bids2table/_entities.py b/bids2table/_entities.py
@@ -144,7 +144,7 @@ def parse_bids_entities(path: str | Path) -> dict[str, str]:
         path: BIDS path to parse.
 
     Returns:
-        entities: dict mapping BIDS entity keys to values.
+        A dict mapping BIDS entity keys to values.
     """
     if isinstance(path, str):
         path = Path(path)
@@ -207,9 +207,10 @@ def validate_bids_entities(
         entities: dict mapping BIDS keys to unvalidated entities
 
     Returns:
-        valid_entities: A mapping of valid BIDS keys to type-casted values.
-        extra_entities: A mapping of any leftover entity mappings that didn't match a
-            known entity or failed validation.
+        A tuple of `(valid_entities, extra_entities)`, where `valid_entities` is a
+            mapping of valid BIDS keys to type-cast values, and `extra_entities` is a
+            mapping of any leftover entities that didn't match a known entity or
+            failed validation.
     """
     valid_entities = {}
     extra_entities = {}
@@ -254,7 +255,7 @@ def format_bids_path(entities: dict[str, Any], int_format: str = "%d") -> Path:
         int_format: format string for integer (index) BIDS values.
 
     Returns:
-        path: formatted `Path` instance.
+        A formatted `Path` instance.
     """
     special = {"datatype", "suffix", "ext"}
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,6 +44,7 @@ dev = [
     "ipython>=9.2.0",
     "jupyter>=1.1.1",
     "pandas==2.2.3",
+    "pdoc>=15.0.3",
     "pre-commit>=4.1.0",
     "pytest>=8.3.5",
     "pytest-cov>=6.0.0",

diff --git a/uv.lock b/uv.lock
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ __pycache__/ @@
     *.egg-info/
     dist
     build
+    docs
     */_version.py
     # Unit test / coverage reports
@@ Expand Down @@