|
1 | 1 | # ruff: noqa: I001 |
2 | | -"""Index BIDS datasets fast, locally or in the cloud.""" |
| 2 | +r""" |
| 3 | +[Build](https://github.com/childmindresearch/bids2table/actions/workflows/ci.yaml?query=branch%3Amain) |
| 4 | +[Docs](https://childmindresearch.github.io/bids2table/bids2table) |
| 5 | +[Coverage](https://codecov.io/gh/childmindresearch/bids2table) |
| 6 | +[Ruff](https://github.com/astral-sh/ruff) |
| 7 | + |
| 8 | +[License](LICENSE) |
3 | 9 |
|
| 10 | +Index [BIDS](https://bids-specification.readthedocs.io/en/stable/) datasets fast, locally or in the cloud. |
| 11 | +
|
| 12 | +## Installation |
| 13 | +
|
| 14 | +To install the latest release from PyPI, you can run |
| 15 | +
|
| 16 | +```sh |
| 17 | +pip install bids2table |
| 18 | +``` |
| 19 | +
|
| 20 | +To install with S3 support, include the `s3` extra |
| 21 | +
|
| 22 | +```sh |
| 23 | +pip install bids2table[s3] |
| 24 | +``` |
| 25 | +
|
| 26 | +The latest development version can be installed with |
| 27 | +
|
| 28 | +```sh |
| 29 | +pip install "bids2table[s3] @ git+https://github.com/childmindresearch/bids2table.git" |
| 30 | +``` |
| 31 | +
|
| 32 | +## Usage |
| 33 | +
|
| 34 | +To run these examples, you will need to clone the [bids-examples](https://github.com/bids-standard/bids-examples) repo. |
| 35 | +
|
| 36 | +```sh |
| 37 | +git clone -b 1.9.0 https://github.com/bids-standard/bids-examples.git |
| 38 | +``` |
| 39 | +
|
| 40 | +### Finding BIDS datasets |
| 41 | +
|
| 42 | +You can search a directory for valid BIDS datasets using `b2t2 find` |
| 43 | +
|
| 44 | +``` |
| 45 | +(bids2table) clane$ b2t2 find bids-examples | head -n 10 |
| 46 | +bids-examples/asl002 |
| 47 | +bids-examples/ds002 |
| 48 | +bids-examples/ds005 |
| 49 | +bids-examples/asl005 |
| 50 | +bids-examples/ds051 |
| 51 | +bids-examples/eeg_rishikesh |
| 52 | +bids-examples/asl004 |
| 53 | +bids-examples/asl003 |
| 54 | +bids-examples/ds003 |
| 55 | +bids-examples/eeg_cbm |
| 56 | +``` |
| 57 | +
|
| 58 | +### Indexing datasets from the command line |
| 59 | +
|
| 60 | +Indexing datasets is done with `b2t2 index`. Here we index a single example dataset, saving the output as a parquet file. |
| 61 | +
|
| 62 | +``` |
| 63 | +(bids2table) clane$ b2t2 index -o ds102.parquet bids-examples/ds102 |
| 64 | +ds102: 100%|███████████████████████████████████████| 26/26 [00:00<00:00, 154.12it/s, sub=26, N=130] |
| 65 | +``` |
| 66 | +
|
| 67 | +You can also index a list of datasets. Note that each iteration in the progress bar represents one dataset. |
| 68 | +
|
| 69 | +``` |
| 70 | +(bids2table) clane$ b2t2 index -o bids-examples.parquet bids-examples/* |
| 71 | +100%|████████████████████████████████████████████| 87/87 [00:00<00:00, 113.59it/s, ds=None, N=9727] |
| 72 | +``` |
| 73 | +
|
| 74 | +You can pipe the output of `b2t2 find` to `b2t2 index` to create an index of all datasets under a root directory. |
| 75 | +
|
| 76 | +``` |
| 77 | +(bids2table) clane$ b2t2 find bids-examples | b2t2 index -o bids-examples.parquet |
| 78 | +97it [00:01, 96.05it/s, ds=ieeg_filtered_speech, N=10K] |
| 79 | +``` |
| 80 | +
|
| 81 | +The resulting index will include both top-level datasets (as in the previous command) as well as nested derivatives datasets. |
| 82 | +
|
| 83 | +### Indexing datasets hosted on S3 |
| 84 | +
|
| 85 | +bids2table supports indexing datasets hosted on S3 via [cloudpathlib](https://github.com/drivendataorg/cloudpathlib). To use this functionality, make sure to install bids2table with the `s3` extra. Alternatively, you can install cloudpathlib directly: |
| 86 | +
|
| 87 | +```sh |
| 88 | +pip install cloudpathlib[s3] |
| 89 | +``` |
| 90 | +
|
| 91 | +As an example, here we index all datasets on [OpenNeuro](https://openneuro.org/) |
| 92 | +
|
| 93 | +``` |
| 94 | +(bids2table) clane$ b2t2 index -o openneuro.parquet \ |
| 95 | + -j 8 --use-threads s3://openneuro.org/ds* |
| 96 | +100%|█████████████████████████████████████| 1408/1408 [12:25<00:00, 1.89it/s, ds=ds006193, N=1.2M] |
| 97 | +``` |
| 98 | +
|
| 99 | +Using 8 threads, we can index all ~1400 OpenNeuro datasets (1.2M files) in less than 15 minutes. |
| 100 | +
|
| 101 | +
|
| 102 | +### Indexing datasets from Python |
| 103 | +
|
| 104 | +You can also index datasets using the Python API. |
| 105 | +
|
| 106 | +```python |
| 107 | +import bids2table as b2t2 |
| 108 | +import pandas as pd |
| 109 | +import pyarrow as pa |
| 110 | +import pyarrow.parquet as pq |
| 111 | +
|
| 112 | +# Index a single dataset. |
| 113 | +tab = b2t2.index_dataset("bids-examples/ds102") |
| 114 | +
|
| 115 | +# Find and index a batch of datasets. |
| 116 | +tabs = b2t2.batch_index_dataset( |
| 117 | + b2t2.find_bids_datasets("bids-examples"), |
| 118 | +) |
| 119 | +tab = pa.concat_tables(tabs) |
| 120 | +
|
| 121 | +# Index a dataset on S3. |
| 122 | +tab = b2t2.index_dataset("s3://openneuro.org/ds000224") |
| 123 | +
|
| 124 | +# Save as parquet. |
| 125 | +pq.write_table(tab, "ds000224.parquet") |
| 126 | +
|
| 127 | +# Convert to a pandas dataframe. |
| 128 | +df = tab.to_pandas(types_mapper=pd.ArrowDtype) |
| 129 | +``` |
| 130 | +""" |
| 131 | + |
| 132 | +__all__ = [ |
| 133 | + "index_dataset", |
| 134 | + "batch_index_dataset", |
| 135 | + "find_bids_datasets", |
| 136 | + "get_arrow_schema", |
| 137 | + "get_column_names", |
| 138 | + "parse_bids_entities", |
| 139 | + "validate_bids_entities", |
| 140 | + "set_bids_schema", |
| 141 | + "get_bids_schema", |
| 142 | + "get_bids_entity_arrow_schema", |
| 143 | + "format_bids_path", |
| 144 | + "cloudpathlib_is_available", |
| 145 | +] |
| 146 | + |
| 147 | +from ._indexing import ( |
| 148 | + index_dataset, |
| 149 | + batch_index_dataset, |
| 150 | + find_bids_datasets, |
| 151 | + get_arrow_schema, |
| 152 | + get_column_names, |
| 153 | +) |
4 | 154 | from ._entities import ( |
5 | 155 | parse_bids_entities, |
6 | 156 | validate_bids_entities, |
|
9 | 159 | get_bids_entity_arrow_schema, |
10 | 160 | format_bids_path, |
11 | 161 | ) |
12 | | -from ._indexing import ( |
13 | | - find_bids_datasets, |
14 | | - index_dataset, |
15 | | - batch_index_dataset, |
16 | | - get_arrow_schema, |
17 | | - get_column_names, |
18 | | -) |
19 | 162 | from ._pathlib import Path, cloudpathlib_is_available |
20 | 163 | from ._version import * |
0 commit comments