Skip to content

Commit fc57805

Browse files
committed
change cli and try to fix the PARTICIPANT_ID_MISMATCH
1 parent 9463421 commit fc57805

File tree

4 files changed

+114
-66
lines changed

4 files changed

+114
-66
lines changed

cubids/cli.py

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,10 @@ def _parse_validate():
106106
If a filename prefix is provided, the output will be placed in
107107
bids_dir/code/CuBIDS. If a full path is provided, the output files will
108108
go to the specified location.
109-
- --sequential: Run the BIDS validator sequentially on each subject.
109+
- --validation-scope: Choose between 'dataset' (default) or 'subject' validation.
110110
- --container: Docker image tag or Singularity image file.
111111
- --ignore-nifti-headers: Disregard NIfTI header content during validation.
112-
- --sequential-subjects: Filter the sequential run to only include the
113-
listed subjects.
112+
- --participant-label: Filter the validation to only include the listed subjects.
114113
"""
115114
parser = argparse.ArgumentParser(
116115
description="cubids validate: Wrapper around the official BIDS Validator",
@@ -143,10 +142,13 @@ def _parse_validate():
143142
),
144143
)
145144
parser.add_argument(
146-
"--sequential",
147-
action="store_true",
148-
default=False,
149-
help="Run the BIDS validator sequentially on each subject.",
145+
"--validation-scope",
146+
choices=["dataset", "subject"],
147+
default="dataset",
148+
help=(
149+
"Scope of validation. 'dataset' validates the entire dataset (default). "
150+
"'subject' validates each subject separately."
151+
),
150152
required=False,
151153
)
152154
parser.add_argument(
@@ -157,12 +159,12 @@ def _parse_validate():
157159
required=False,
158160
)
159161
parser.add_argument(
160-
"--sequential-subjects",
162+
"--participant-label",
161163
action="store",
162164
default=None,
163165
help=(
164-
"List: Filter the sequential run to only include "
165-
"the listed subjects. e.g. --sequential-subjects "
166+
"List: Filter the validation to only include "
167+
"the listed subjects. e.g. --participant-label "
166168
"sub-01 sub-02 sub-03"
167169
),
168170
nargs="+",
@@ -194,24 +196,12 @@ def _parse_validate():
194196
dest="n_cpus",
195197
default=1,
196198
help=(
197-
"Number of CPUs to use for parallel validation when --sequential is used. "
199+
"Number of CPUs to use for parallel validation "
200+
"when `--validation-scope` is 'subject'. "
198201
"Defaults to 1 (sequential processing)."
199202
),
200203
required=False,
201204
)
202-
parser.add_argument(
203-
"--max-workers",
204-
type=int,
205-
action="store",
206-
dest="max_workers",
207-
default=None,
208-
help=(
209-
"Maximum number of parallel workers to use for validation. "
210-
"If not specified, automatically optimized for I/O-bound workloads. "
211-
"Set this to explicitly control parallelism (e.g., to avoid disk I/O contention)."
212-
),
213-
required=False,
214-
)
215205
return parser
216206

217207

cubids/tests/test_cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,18 +249,18 @@ def test_validate_command_with_test_dataset(tmp_path):
249249
assert (output_prefix.parent / f"{output_prefix.name}_validation.json").exists()
250250

251251

252-
def test_validate_sequential_with_n_cpus(tmp_path):
253-
"""Test the validate command with sequential flag and n_cpus parallelization."""
252+
def test_validate_subject_scope_with_n_cpus(tmp_path):
253+
"""Test the validate command with validation-scope subject and n_cpus parallelization."""
254254
# Copy test dataset to temporary directory
255255
test_data = TEST_DATA / "BIDS_Dataset"
256256
bids_dir = tmp_path / "BIDS_Dataset"
257257
shutil.copytree(test_data, bids_dir)
258258

259-
# Run sequential validation with 2 CPUs (parallel processing)
259+
# Run subject-level validation with 2 CPUs (parallel processing)
260260
output_prefix = tmp_path / "validation_parallel"
261261

262262
# This should complete without error
263-
_main(["validate", str(bids_dir), str(output_prefix), "--sequential", "--n-cpus", "1"])
263+
_main(["validate", str(bids_dir), str(output_prefix), "--validation-scope", "subject", "--n-cpus", "1"])
264264

265265
# Verify the command completed successfully by checking if the output files exist
266266
assert (output_prefix.parent / f"{output_prefix.name}_validation.tsv").exists()

cubids/workflows.py

Lines changed: 91 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -86,27 +86,63 @@ def _link_or_copy(src_path, dst_path):
8686
shutil.copy2(src_path, dst_path)
8787

8888
# Create temporary directory and populate with links
89-
with tempfile.TemporaryDirectory() as tmpdir:
89+
with tempfile.TemporaryDirectory() as temporary_bids_dir:
9090
for file_path in files_list:
9191
# Cut the path down to the subject label
9292
bids_start = file_path.find(subject)
9393

9494
# Maybe it's a single file (root-level file)
9595
if bids_start < 1:
96-
tmp_file_dir = tmpdir
96+
tmp_file_dir = temporary_bids_dir
9797
else:
9898
bids_folder = Path(file_path[bids_start:]).parent
99-
tmp_file_dir = os.path.join(tmpdir, str(bids_folder))
99+
tmp_file_dir = os.path.join(temporary_bids_dir, str(bids_folder))
100100

101101
if not os.path.exists(tmp_file_dir):
102102
os.makedirs(tmp_file_dir)
103103

104104
output_path = os.path.join(tmp_file_dir, str(Path(file_path).name))
105105
_link_or_copy(file_path, output_path)
106106

107+
# Ensure participants.tsv is available in temp root
108+
# copy from original file list if missing
109+
participants_tsv_path = os.path.join(temporary_bids_dir, "participants.tsv")
110+
if not os.path.exists(participants_tsv_path):
111+
# Try to find a source participants.tsv in the provided file list
112+
try:
113+
source_participants_tsv_path = None
114+
for candidate_path in files_list:
115+
if os.path.basename(candidate_path) == "participants.tsv":
116+
source_participants_tsv_path = candidate_path
117+
break
118+
if source_participants_tsv_path:
119+
_link_or_copy(source_participants_tsv_path, participants_tsv_path)
120+
except Exception: # noqa: BLE001
121+
pass
122+
123+
# If participants.tsv exists in the temp BIDS root, filter to current subject
124+
if os.path.exists(participants_tsv_path):
125+
try:
126+
participants_table = pd.read_csv(participants_tsv_path, sep="\t")
127+
if "participant_id" in participants_table.columns:
128+
participant_ids = participants_table["participant_id"]
129+
is_current_subject = participant_ids.eq(subject)
130+
participants_table = participants_table[is_current_subject]
131+
participants_table.to_csv(
132+
participants_tsv_path,
133+
sep="\t",
134+
index=False,
135+
)
136+
except Exception as e: # noqa: F841
137+
# Non-fatal: continue validation even if filtering fails
138+
pass
139+
107140
# Run the validator
108141
call = build_validator_call(
109-
tmpdir, local_validator, ignore_nifti_headers, schema=schema_path
142+
temporary_bids_dir,
143+
local_validator,
144+
ignore_nifti_headers,
145+
schema=schema_path,
110146
)
111147
result = run_validator(call)
112148

@@ -124,13 +160,12 @@ def _link_or_copy(src_path, dst_path):
124160
def validate(
125161
bids_dir,
126162
output_prefix,
127-
sequential,
128-
sequential_subjects,
163+
validation_scope,
164+
participant_label,
129165
local_validator,
130166
ignore_nifti_headers,
131167
schema,
132168
n_cpus=1,
133-
max_workers=None,
134169
):
135170
"""Run the bids validator.
136171
@@ -140,35 +175,28 @@ def validate(
140175
Path to the BIDS directory.
141176
output_prefix : :obj:`pathlib.Path`
142177
Output filename prefix.
143-
sequential : :obj:`bool`
144-
Run the validator sequentially.
145-
sequential_subjects : :obj:`list` of :obj:`str`
146-
Filter the sequential run to only include the listed subjects.
178+
validation_scope : :obj:`str`
179+
Scope of validation: 'dataset' validates the entire dataset,
180+
'subject' validates each subject separately.
181+
participant_label : :obj:`list` of :obj:`str`
182+
Filter the validation to only include the listed subjects.
147183
local_validator : :obj:`bool`
148184
Use the local bids validator.
149185
ignore_nifti_headers : :obj:`bool`
150186
Ignore NIfTI headers when validating.
151187
schema : :obj:`pathlib.Path` or None
152188
Path to the BIDS schema file.
153189
n_cpus : :obj:`int`
154-
Number of CPUs to use for parallel validation (only when sequential=True).
190+
Number of CPUs to use for parallel validation (only when validation_scope='subject').
155191
Default is 1 (sequential processing).
156-
max_workers : :obj:`int` or None
157-
Maximum number of parallel workers. If None, automatically optimized
158-
using formula: sqrt(n_cpus * 16) to balance I/O throughput. Set explicitly
159-
to override (e.g., for I/O-constrained systems).
160192
"""
161193
# Ensure n_cpus is at least 1
162194
n_cpus = max(1, n_cpus)
163-
# Derive effective worker count: honor explicit max_workers; otherwise use heuristic
164-
if max_workers is not None:
165-
effective_workers = max(1, int(max_workers))
166-
else:
167-
# Heuristic tuned for I/O-bound workloads materializing files + validator runs.
168-
# sqrt(n_cpus * 16) caps concurrency to avoid disk thrashing while keeping CPU busy.
169-
effective_workers = max(1, int((n_cpus * 16) ** 0.5))
170-
# Do not exceed n_cpus unless user explicitly asks via --max-workers
171-
effective_workers = min(effective_workers, n_cpus)
195+
# Derive effective worker count using heuristic
196+
# Heuristic tuned for I/O-bound workloads materializing files + validator runs.
197+
effective_workers = max(1, int((n_cpus * 16) ** 0.5))
198+
# Do not exceed n_cpus
199+
effective_workers = min(effective_workers, n_cpus)
172200

173201
# check status of output_prefix, absolute or relative?
174202
abs_path_output = True
@@ -182,7 +210,7 @@ def validate(
182210
subprocess.run(["mkdir", str(bids_dir / "code" / "CuBIDS")])
183211

184212
# Run directly from python using subprocess
185-
if not sequential:
213+
if validation_scope == "dataset":
186214
# run on full dataset
187215
call = build_validator_call(
188216
str(bids_dir),
@@ -236,8 +264,8 @@ def validate(
236264

237265
parsed = []
238266

239-
if sequential_subjects:
240-
subjects_dict = {k: v for k, v in subjects_dict.items() if k in sequential_subjects}
267+
if participant_label:
268+
subjects_dict = {k: v for k, v in subjects_dict.items() if k in participant_label}
241269
assert len(list(subjects_dict.keys())) > 1, "No subjects found in filter"
242270

243271
# Convert schema Path to string if it exists (for multiprocessing pickling)
@@ -302,26 +330,56 @@ def _link_or_copy(src_path, dst_path):
302330

303331
for subject, files_list in tqdm.tqdm(subjects_dict.items()):
304332
# Create a temporary directory and populate with links
305-
with tempfile.TemporaryDirectory() as tmpdirname:
333+
with tempfile.TemporaryDirectory() as temporary_bids_dir:
306334

307335
for file_path in files_list:
308336
bids_start = file_path.find(subject)
309337

310338
if bids_start < 1:
311-
tmp_file_dir = tmpdirname
339+
tmp_file_dir = temporary_bids_dir
312340
else:
313341
bids_folder = Path(file_path[bids_start:]).parent
314-
tmp_file_dir = os.path.join(tmpdirname, str(bids_folder))
342+
tmp_file_dir = os.path.join(temporary_bids_dir, str(bids_folder))
315343

316344
if not os.path.exists(tmp_file_dir):
317345
os.makedirs(tmp_file_dir)
318346
output = os.path.join(tmp_file_dir, str(Path(file_path).name))
319347
_link_or_copy(file_path, output)
320348

349+
# Ensure participants.tsv exists; copy if missing, then filter
350+
participants_tsv_path = os.path.join(temporary_bids_dir, "participants.tsv")
351+
if not os.path.exists(participants_tsv_path):
352+
try:
353+
source_participants_tsv_path = None
354+
for candidate_path in files_list:
355+
if os.path.basename(candidate_path) == "participants.tsv":
356+
source_participants_tsv_path = candidate_path
357+
break
358+
if source_participants_tsv_path:
359+
_link_or_copy(source_participants_tsv_path, participants_tsv_path)
360+
except Exception: # noqa: BLE001
361+
pass
362+
363+
if os.path.exists(participants_tsv_path):
364+
try:
365+
participants_table = pd.read_csv(participants_tsv_path, sep="\t")
366+
if "participant_id" in participants_table.columns:
367+
participant_ids = participants_table["participant_id"]
368+
is_current_subject = participant_ids.eq(subject)
369+
participants_table = participants_table[is_current_subject]
370+
participants_table.to_csv(
371+
participants_tsv_path,
372+
sep="\t",
373+
index=False,
374+
)
375+
except Exception as e: # noqa: F841
376+
# Non-fatal: continue validation even if filtering fails
377+
pass
378+
321379
# Run the validator
322380
nifti_head = ignore_nifti_headers
323381
call = build_validator_call(
324-
tmpdirname, local_validator, nifti_head, schema=schema
382+
temporary_bids_dir, local_validator, nifti_head, schema=schema
325383
)
326384
ret = run_validator(call)
327385
if ret.returncode != 0:
@@ -379,7 +437,7 @@ def bids_version(bids_dir, write=False, schema=None):
379437
Path to the BIDS schema file.
380438
"""
381439
# Need to run validator to get output with schema version
382-
# Copy code from `validate --sequential`
440+
# Copy code from `validate --validation-scope subject`
383441

384442
try: # return first subject
385443
# Get all folders that start with "sub-"

docs/example.rst

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -176,23 +176,23 @@ BIDS validation
176176

177177
.. code-block:: console
178178
179-
$ cubids validate BIDS_Dataset_DataLad v0 --sequential
179+
$ cubids validate BIDS_Dataset_DataLad v0 --validation-scope subject
180180
181181
.. note::
182-
The use of the ``--sequential`` flag forces the validator to treat each participant as its
182+
Using the ``--validation-scope subject`` option forces the validator to treat each participant as its
183183
own BIDS dataset.
184184
This can be helpful for identifying heterogeneous elements,
185185
or validating large datasets that would otherwise result in
186186
"RangeError: Invalid string length" errors when the validator crashes
187187
(producing empty STDOUT) because the JSON output is too large to serialize.
188188

189-
But ``--sequential`` can be slowed down by extremely large datasets.
189+
But ``--validation-scope subject`` validation can be slow for very large datasets.
190190
To speed up validation, you can use the ``--n-cpus`` flag to enable parallel processing.
191191
For example, to validate using 4 CPUs:
192192

193193
.. code-block:: console
194194
195-
$ cubids validate BIDS_Dataset_DataLad v0 --sequential --n-cpus 4
195+
$ cubids validate BIDS_Dataset_DataLad v0 --validation-scope subject --n-cpus 4
196196
197197
.. warning::
198198
For internetless use cases, please see dedicated section of the `Installation page
@@ -285,7 +285,7 @@ To verify that there are no remaining validation errors, we rerun validation wit
285285

286286
.. code-block:: console
287287
288-
$ cubids validate BIDS_Dataset_DataLad v1 --sequential
288+
$ cubids validate BIDS_Dataset_DataLad v1 --validation-scope subject
289289
290290
This command should produce no TSV output, and instead print "No issues/warnings parsed,
291291
your dataset is BIDS valid" to the terminal,

0 commit comments

Comments
 (0)