rf(indexing): thread schema through batch_index_dataset

effigies · effigies · commit 5e98d7271ae2 · 2026-04-30T04:26:28.000-04:00
diff --git a/bids2table/_indexing.py b/bids2table/_indexing.py
@@ -278,25 +278,32 @@ def batch_index_dataset(
     max_workers: int | None = 0,
     executor_cls: type[Executor] = ProcessPoolExecutor,
     show_progress: bool = False,
+    schema: BIDSSchema | pa.Schema | Namespace | str | Path | None = None,
 ) -> Generator[pa.Table, None, None]:
     """Index a batch of BIDS datasets.
 
     Args:
         roots: List of BIDS dataset root directories.
         max_workers: Number of indexing processes to run in parallel. Setting
             `max_workers=0` (the default) uses the main process only. Setting
-            `max_workers=None` starts as many workers as there are available CPUs. See
-            `concurrent.futures.ProcessPoolExecutor` for details.
+            `max_workers=None` starts as many workers as there are available CPUs.
+            See `concurrent.futures.ProcessPoolExecutor` for details.
         executor_cls: Executor class to use for parallel indexing.
         show_progress: Show progress bar.
+        schema: Schema applied to every dataset: a `BIDSSchema`, `pa.Schema`,
+            `Namespace`, or path/URL. `None` uses the module-level default.
 
     Yields:
         An Arrow table index for each BIDS dataset.
     """
+    bids_schema = _resolve(schema)
+    entity_arrow_schema = bids_schema.arrow_schema
+    func = partial(_batch_index_func, schema=entity_arrow_schema)
+
     file_count = 0
     for dataset, table in (
         pbar := tqdm(
-            _pmap(_batch_index_func, roots, max_workers, executor_cls=executor_cls),
+            _pmap(func, roots, max_workers, executor_cls=executor_cls),
             total=len(roots) if isinstance(roots, Sequence) else None,
             disable=show_progress not in {True, "dataset"},
         )
@@ -306,9 +313,12 @@ def batch_index_dataset(
         yield table
 
 
-def _batch_index_func(root: str | PathT) -> tuple[str | None, pa.Table]:
+def _batch_index_func(
+    root: str | PathT,
+    schema: pa.Schema | None = None,
+) -> tuple[str | None, pa.Table]:
     dataset, _ = _get_bids_dataset(root)
-    table = index_dataset(root, max_workers=0, show_progress=False)
+    table = index_dataset(root, max_workers=0, show_progress=False, schema=schema)
     return dataset, table
 
 
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
@@ -293,3 +293,12 @@ def test_index_dataset_workers_honor_explicit_schema():
         BIDS_EXAMPLES / "ds102", schema=tagged, max_workers=2
     )
     assert table.schema.metadata[b"test_marker"] == b"tagged"
+
+
+def test_batch_index_dataset_with_explicit_schema():
+    s = BIDSSchema.from_path(None)
+    roots = [p.parent for p in BIDS_EXAMPLES.glob("*/dataset_description.json")][:2]
+    tables = list(indexing.batch_index_dataset(roots, schema=s))
+    assert len(tables) == len(roots)
+    for t in tables:
+        assert "sub" in t.schema.names