Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
259df02
feat(schema): add BIDSSchema.from_path skeleton
effigies Apr 30, 2026
f31f666
feat(schema): add BIDSSchema.from_namespace and lazy bids_schema
effigies Apr 30, 2026
11f4b0a
feat(schema): add BIDSSchema.from_arrow reconstruction
effigies Apr 30, 2026
dbb6469
rf(schema): tighten exception clause and document encoding
effigies Apr 30, 2026
3edae56
feat(schema): add BIDSSchema.prepare polymorphic constructor
effigies Apr 30, 2026
e1e82f7
test(schema): cover str/Path dispatch and tighten error-message regex
effigies Apr 30, 2026
8aa44ab
test(schema): pickle round-trip for BIDSSchema
effigies Apr 30, 2026
23e2323
test(schema): assert pickle re-materialization and document setstate
effigies Apr 30, 2026
99b30cc
feat(schema): add _DEFAULT_SCHEMA with rebindable set_bids_schema
effigies Apr 30, 2026
c9e110d
rf(entities): take schema arg, drop module globals
effigies Apr 30, 2026
6f7b07e
rf(schema): add BIDSSchema.lookups accessors; remove private cross-mo…
effigies Apr 30, 2026
f9040c4
rf(indexing): get_arrow_schema and get_column_names accept schema
effigies Apr 30, 2026
aaaad31
rf(indexing): simplify get_arrow_schema dispatch and document get_col…
effigies Apr 30, 2026
de3566c
rf(indexing): thread schema through index_dataset
effigies Apr 30, 2026
2e42736
test(indexing): regression test for worker schema propagation
effigies Apr 30, 2026
888b0c2
rf(indexing): thread schema through batch_index_dataset
effigies Apr 30, 2026
9855b03
test(indexing): integration test for two schemas in one process
effigies Apr 30, 2026
03b485a
doc: document custom and multiple BIDS schemas
effigies Apr 30, 2026
d3352c3
rf: Import schema functions from _schema
effigies Apr 30, 2026
7924e38
rf: Unify BIDSSchema preparation, drop backwards compatibility
effigies Apr 30, 2026
a87584c
rf: Rename schema adapter, drop bids_schema reference, default load(N…
effigies May 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,20 @@ pq.write_table(tab, "ds000224.parquet")
# Convert to a pandas dataframe.
df = tab.to_pandas(types_mapper=pd.ArrowDtype)
```

### Using a custom or multiple BIDS schemas

`bids2table` loads the default bidsschematools schema at import, which is the
right choice for most users. To index a dataset against a specific schema
version or a schema stored at a particular path, pass a reference to that
schema as the `schema` argument:

```python
from bids2table import index_dataset

table = index_dataset("/path/to/dataset", schema="/path/to/bids-schema")
```

The `schema` argument accepts a pyarrow schema (`pa.Schema`), a
bidsschematools `Namespace`, a path or URL, or `None` (use the module-level
default). Different `index_dataset` calls in the same process may be given
different schemas; this is supported and also works under multiprocessing.
6 changes: 0 additions & 6 deletions bids2table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,14 @@
"get_column_names",
"parse_bids_entities",
"validate_bids_entities",
"set_bids_schema",
"get_bids_schema",
"get_bids_entity_arrow_schema",
"format_bids_path",
"load_bids_metadata",
"cloudpathlib_is_available",
]

from ._entities import (
format_bids_path,
get_bids_entity_arrow_schema,
get_bids_schema,
parse_bids_entities,
set_bids_schema,
validate_bids_entities,
)
from ._indexing import (
Expand Down
132 changes: 15 additions & 117 deletions bids2table/_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,62 +3,20 @@
Uses the BIDS schema for validation.
"""

import json
from __future__ import annotations

import re
from functools import lru_cache
from pathlib import Path
from typing import Any

import bidsschematools.schema
import pyarrow as pa
from bidsschematools.types import Namespace

from ._logging import setup_logger
from ._schema import SchemaAdapter

# Type of a validated entity value (see the return annotation of
# `validate_bids_entities`).
BIDSValue = str | int

# The four names below are annotation-only declarations; they are bound at
# runtime by `set_bids_schema()`.

# Global BIDS schema namespace.
_BIDS_SCHEMA: Namespace
# Map of entity names to schema metadata.
_BIDS_ENTITY_SCHEMA: dict[str, dict[str, Any]]
# Map of BIDS short names (e.g. 'sub') to long entities ('subject').
_BIDS_NAME_ENTITY_MAP: dict[str, str]

# BIDS schema in Arrow format
_BIDS_ENTITY_ARROW_SCHEMA: pa.Schema

# "Special" entities that are part of the BIDS file name spec but not in the BIDS schema
# (bc they don't follow the '{key}-{value}' format).
# Each record carries the same keys that the code below reads from schema
# entities: "name" is the short name (used as the Arrow field name and as the
# key of the short-to-long name map) and "format" selects the value type.
_BIDS_SPECIAL_ENTITY_SCHEMA = {
"datatype": {
"name": "datatype",
"display_name": "Data type",
"description": "A functional group of different types of data.",
"type": "string",
"format": "special",
},
"suffix": {
"name": "suffix",
"display_name": "Suffix",
"description": "Final part of file name after final '_' and before extension.",
"type": "string",
"format": "special",
},
# Note: the long entity key is "extension" but the short name is "ext".
"extension": {
"name": "ext",
"display_name": "File extension",
"description": "Full file extension after the left-most period.",
"type": "string",
"format": "special",
},
}

# Arrow data type used for each entity "format" value; looked up by
# `_bids_entity_arrow_schema` when building fields.
_BIDS_FORMAT_ARROW_DTYPE_MAP = {
"index": pa.int32(),
"label": pa.string(),
"special": pa.string(),
}

_BIDS_FORMAT_PY_TYPE_MAP = {
"index": int,
"label": str,
Expand All @@ -74,66 +32,6 @@
_logger = setup_logger(__package__)


def set_bids_schema(path: str | Path | None = None) -> None:
    """Load a BIDS schema and rebind the module-level schema globals.

    Rebinds `_BIDS_SCHEMA`, `_BIDS_ENTITY_SCHEMA`, `_BIDS_NAME_ENTITY_MAP` and
    `_BIDS_ENTITY_ARROW_SCHEMA`.

    Args:
        path: Schema location forwarded unchanged to
            `bidsschematools.schema.load_schema` (including `None`).
    """
    global _BIDS_SCHEMA, _BIDS_ENTITY_SCHEMA, _BIDS_NAME_ENTITY_MAP
    global _BIDS_ENTITY_ARROW_SCHEMA

    loaded = bidsschematools.schema.load_schema(path)

    # One config dict per entity listed in the schema rules ...
    entity_cfgs = {}
    for entity in loaded.rules.entities:
        entity_cfgs[entity] = loaded.objects.entities[entity].to_dict()
    # ... plus the special extras (datatype, suffix, extension).
    entity_cfgs.update(_BIDS_SPECIAL_ENTITY_SCHEMA)

    # Short name (e.g. 'sub') -> long entity name (e.g. 'subject').
    short_to_long = {}
    for entity, cfg in entity_cfgs.items():
        short_to_long[cfg["name"]] = entity

    _BIDS_SCHEMA = loaded
    _BIDS_ENTITY_SCHEMA = entity_cfgs
    _BIDS_NAME_ENTITY_MAP = short_to_long

    # Built last, after the three globals above have already been rebound.
    _BIDS_ENTITY_ARROW_SCHEMA = _bids_entity_arrow_schema(
        entity_cfgs,
        bids_version=loaded["bids_version"],
        schema_version=loaded["schema_version"],
    )


def _bids_entity_arrow_schema(
    entity_schema: dict[str, dict[str, Any]],
    bids_version: str,
    schema_version: str,
) -> pa.Schema:
    """Build the Arrow schema describing the given BIDS entities.

    Args:
        entity_schema: Mapping of long entity name to its config dict. Each
            config must provide a "name" (short name) and a "format" key.
        bids_version: Stored in the schema-level metadata.
        schema_version: Stored in the schema-level metadata.

    Returns:
        A `pa.Schema` with one field per entity, named by the entity's short
        name, in the iteration order of `entity_schema`.
    """

    def _as_text(value: Any) -> str:
        # Strings pass through untouched; everything else is JSON-encoded.
        return value if isinstance(value, str) else json.dumps(value)

    arrow_fields = []
    for long_name, cfg in entity_schema.items():
        # The short entity name (e.g. sub) becomes the field name.
        short_name = cfg["name"]
        arrow_type = _BIDS_FORMAT_ARROW_DTYPE_MAP[cfg["format"]]
        # The full entity name (e.g. subject) goes first in the field
        # metadata, followed by the stringified config entries.
        field_meta = {"entity": long_name}
        for key, value in cfg.items():
            field_meta[key] = _as_text(value)
        arrow_fields.append(pa.field(short_name, arrow_type, metadata=field_meta))

    schema_meta = {"bids_version": bids_version, "schema_version": schema_version}
    return pa.schema(arrow_fields, metadata=schema_meta)


def get_bids_schema() -> Namespace:
    """Return the module-level BIDS schema namespace (`_BIDS_SCHEMA`)."""
    return _BIDS_SCHEMA


def get_bids_entity_arrow_schema() -> pa.Schema:
    """Return the module-level Arrow entity schema (`_BIDS_ENTITY_ARROW_SCHEMA`)."""
    return _BIDS_ENTITY_ARROW_SCHEMA


def parse_bids_entities(path: str | Path) -> dict[str, str]:
"""Parse entities from BIDS file path.

Expand All @@ -148,7 +46,7 @@ def parse_bids_entities(path: str | Path) -> dict[str, str]:
"""
if isinstance(path, str):
path = Path(path)
entities = {}
entities: dict[str, str] = {}

filename = path.name
parts = filename.split("_")
Expand Down Expand Up @@ -195,27 +93,31 @@ def _parse_bids_datatype(path: Path) -> str | None:

def validate_bids_entities(
entities: dict[str, Any],
schema: SchemaAdapter | pa.Schema | None = None,
) -> tuple[dict[str, BIDSValue], dict[str, Any]]:
"""Validate BIDS entities.
"""Validate BIDS entities against a schema.

Validates the type and allowed values of each entity against the BIDS schema.

Args:
entities: dict mapping BIDS keys to unvalidated entities
entities: dict mapping BIDS keys to unvalidated entities.
schema: A `SchemaAdapter`, a `pa.Schema` (e.g. inside a worker process), or
None to use the module-level default.

Returns:
A tuple of `(valid_entities, extra_entities)`, where `valid_entities` is a
mapping of valid BIDS keys to type-casted values, and `extra_entities` a
mapping of any leftover entity mappings that didn't match a known entity or
failed validation.
"""
valid_entities = {}
extra_entities = {}
schema_adapter = SchemaAdapter.load(schema)

valid_entities: dict[str, BIDSValue] = {}
extra_entities: dict[str, str] = {}

for name, value in entities.items():
if name in _BIDS_NAME_ENTITY_MAP:
entity = _BIDS_NAME_ENTITY_MAP[name]
cfg = _BIDS_ENTITY_SCHEMA[entity]
if entity := schema_adapter.name_entity_map.get(name):
cfg = schema_adapter.entity_schema[entity]
typ = _BIDS_FORMAT_PY_TYPE_MAP[cfg["format"]]

# Cast to target type.
Expand Down Expand Up @@ -279,7 +181,3 @@ def format_bids_path(entities: dict[str, Any], int_format: str = "%d") -> Path:
path = f"ses-{ses}" / path
path = f"sub-{entities['sub']}" / path
return path


# Initialize the default BIDS schema.
set_bids_schema()
Loading
Loading