14 changes: 14 additions & 0 deletions cubids/cli.py
@@ -535,6 +535,20 @@ def _parse_apply():
         required=False,
     )
 
+    parser.add_argument(
+        "--n-cpus",
+        "--n_cpus",
+        type=int,
+        action="store",
+        dest="n_cpus",
+        default=1,
+        help=(
+            "Number of CPUs to use for datalad jobs. "
+            "Used as --jobs for datalad save and datalad run."
+        ),
+        required=False,
+    )
+
     return parser


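A note on the double option string above: argparse treats the second spelling as an alias, so --n-cpus and --n_cpus both populate the same n_cpus attribute. A minimal standalone sketch (illustrative, not CuBIDS code) that mirrors the argument definition in this hunk:

    import argparse

    # Mirrors the diff above: two option strings, one destination.
    parser = argparse.ArgumentParser(prog="cubids-apply-sketch")
    parser.add_argument(
        "--n-cpus",
        "--n_cpus",
        type=int,
        dest="n_cpus",
        default=1,
        help="Number of CPUs to use for datalad jobs.",
    )

    print(parser.parse_args(["--n-cpus", "4"]).n_cpus)  # 4
    print(parser.parse_args(["--n_cpus", "8"]).n_cpus)  # 8
    print(parser.parse_args([]).n_cpus)                 # 1 (the default)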
319 changes: 201 additions & 118 deletions cubids/cubids.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cubids/metadata_merge.py
@@ -381,7 +381,7 @@ def group_by_acquisition_sets(files_tsv, output_prefix, acq_group_level, is_long

     # Write the mapping of subject/session to
     acq_group_df = pd.DataFrame(grouped_sub_sess)
-    acq_group_df.to_csv(output_prefix + "_AcqGrouping.tsv", sep="\t", index=False)
+    acq_group_df.to_csv(output_prefix + "_AcqGrouping.tsv", sep="\t", index=False, na_rep="n/a")
 
     # Create data dictionary for acq group tsv
     acq_dict = get_acq_dictionary(is_longitudinal)
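The recurring na_rep="n/a" addition in this file and the tests below reflects a BIDS requirement: missing values in BIDS TSV files must be written as the literal string "n/a", whereas pandas writes empty fields for NaN by default. A small self-contained demonstration of the difference:

    import io

    import pandas as pd

    df = pd.DataFrame({"ParamGroup": [1, 2], "MergeInto": [0, None]})

    # Default behavior: NaN becomes an empty field.
    buf = io.StringIO()
    df.to_csv(buf, sep="\t", index=False)
    print(repr(buf.getvalue()))
    # 'ParamGroup\tMergeInto\n1\t0.0\n2\t\n'

    # With na_rep="n/a": missing values come out BIDS-compliant.
    buf = io.StringIO()
    df.to_csv(buf, sep="\t", index=False, na_rep="n/a")
    print(repr(buf.getvalue()))
    # 'ParamGroup\tMergeInto\n1\t0.0\n2\tn/a\n'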
4 changes: 2 additions & 2 deletions cubids/tests/test_apply.py
@@ -323,12 +323,12 @@ def test_cubids_apply_intendedfor(
     # Create a CuBIDS summary tsv
     summary_tsv = tmpdir / "summary.tsv"
     df = pd.DataFrame(summary_data)
-    df.to_csv(summary_tsv, sep="\t", index=False)
+    df.to_csv(summary_tsv, sep="\t", index=False, na_rep="n/a")
 
     # Create a CuBIDS files tsv
     files_tsv = tmpdir / "files.tsv"
     df = pd.DataFrame(fdata)
-    df.to_csv(files_tsv, sep="\t", index=False)
+    df.to_csv(files_tsv, sep="\t", index=False, na_rep="n/a")
 
     # Run cubids apply
     if isinstance(expected, str):
8 changes: 4 additions & 4 deletions cubids/tests/test_bond.py
@@ -459,7 +459,7 @@ def test_tsv_merge_no_datalad(tmp_path):
     summary_df.loc[fa_nan_dwi_row, "MergeInto"] = summary_df.ParamGroup[complete_dwi_row]
 
     valid_tsv_file = tsv_prefix + "_valid_summary.tsv"
-    summary_df.to_csv(valid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(valid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     # about to apply merges!

@@ -472,7 +472,7 @@ def test_tsv_merge_no_datalad(tmp_path):
         complete_dwi_row
     ]
     invalid_tsv_file = tsv_prefix + "_invalid_summary.tsv"
-    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     with pytest.raises(Exception):
         bod.apply_tsv_changes(
@@ -572,7 +572,7 @@ def test_tsv_merge_changes(tmp_path):
     summary_df.loc[fa_nan_dwi_row, "MergeInto"] = summary_df.ParamGroup[complete_dwi_row]
 
     valid_tsv_file = tsv_prefix + "_valid_summary.tsv"
-    summary_df.to_csv(valid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(valid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     # about to merge
     bod.apply_tsv_changes(valid_tsv_file, original_files_tsv, str(tmp_path / "ok_modified"))
@@ -584,7 +584,7 @@ def test_tsv_merge_changes(tmp_path):
         complete_dwi_row
     ]
     invalid_tsv_file = tsv_prefix + "_invalid_summary.tsv"
-    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     with pytest.raises(Exception):
         bod.apply_tsv_changes(
2 changes: 1 addition & 1 deletion cubids/tests/utils.py
@@ -108,7 +108,7 @@ def _add_deletion(summary_tsv):
"""
     df = pd.read_table(summary_tsv)
     df.loc[3, "MergeInto"] = 0
-    df.to_csv(summary_tsv, sep="\t", index=False)
+    df.to_csv(summary_tsv, sep="\t", index=False, na_rep="n/a")
     return df.loc[3, "KeyParamGroup"]


75 changes: 45 additions & 30 deletions cubids/workflows.py
@@ -127,34 +127,44 @@ def _link_or_copy(src_path, dst_path):
         if not os.path.exists(subject_folder_path):
             os.makedirs(subject_folder_path, exist_ok=True)
 
-        # Ensure participants.tsv is available in temp root and is a copy (not a link)
-        # Always COPY (never link) to avoid modifying the original file when filtering
-        participants_tsv_path = os.path.join(temporary_bids_dir, "participants.tsv")
-        # Always remove existing file first in case it was linked in the earlier loop
-        if os.path.exists(participants_tsv_path):
-            try:
-                os.remove(participants_tsv_path)
-            except Exception:  # noqa: BLE001
-                pass
-        # Try to find a source participants.tsv in the provided file list
-        try:
-            source_participants_tsv_path = None
-            for candidate_path in files_list:
-                if os.path.basename(candidate_path) == "participants.tsv":
-                    source_participants_tsv_path = candidate_path
-                    break
-            # If not in file list, try to get it from the original bids_dir
-            if not source_participants_tsv_path and bids_dir:
-                potential_path = os.path.join(bids_dir, "participants.tsv")
-                if os.path.exists(potential_path):
-                    source_participants_tsv_path = potential_path
-            if source_participants_tsv_path:
-                # Always copy (not link) to protect the original file from modification
-                shutil.copy2(source_participants_tsv_path, participants_tsv_path)
-        except Exception:  # noqa: BLE001
-            pass
+        # Ensure participants.tsv and participants.json are available in temp root
+        # Always COPY (never link) to avoid modifying the original files when filtering
+        participants_files = ["participants.tsv", "participants.json"]
+        for filename in participants_files:
+            dest_path = os.path.join(temporary_bids_dir, filename)
+            # Always remove existing file first in case it was linked in the earlier loop
+            if os.path.exists(dest_path):
+                try:
+                    os.remove(dest_path)
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to remove existing file '{dest_path}': {e}. "
+                        "The file may be overwritten or cause conflicts."
+                    )
+            # Try to find source file in the provided file list
+            try:
+                source_path = None
+                for candidate_path in files_list:
+                    if os.path.basename(candidate_path) == filename:
+                        source_path = candidate_path
+                        break
+                # If not in file list, try to get it from the original bids_dir
+                if not source_path and bids_dir:
+                    potential_path = os.path.join(bids_dir, filename)
+                    if os.path.exists(potential_path):
+                        source_path = potential_path
+                if source_path:
+                    # Always copy (not link) to protect the original file from modification
+                    shutil.copy2(source_path, dest_path)
+            except Exception as e:
+                source_info = source_path if source_path else "unknown location"
+                logger.warning(
+                    f"Failed to copy '{filename}' from '{source_info}' to '{dest_path}': {e}. "
+                    "The file may be missing or inaccessible."
+                )
 
         # If participants.tsv exists in the temp BIDS root, filter to current subject
+        participants_tsv_path = os.path.join(temporary_bids_dir, "participants.tsv")
         if os.path.exists(participants_tsv_path):
             try:
                 participants_table = pd.read_csv(participants_tsv_path, sep="\t")
@@ -166,10 +176,13 @@ def _link_or_copy(src_path, dst_path):
                     participants_tsv_path,
                     sep="\t",
                     index=False,
+                    na_rep="n/a",
                 )
-            except Exception as e:  # noqa: F841
-                # Non-fatal: continue validation even if filtering fails
-                pass
+            except Exception as e:
+                logger.warning(
+                    f"Failed to filter participants.tsv for subject {subject}: {e}. "
+                    "Continuing validation without filtering."
+                )
 
         # Run the validator
         call = build_validator_call(
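The "always copy, never link" rule in the hunks above matters because a hard link (or a symlink) shares storage with the original: filtering participants.tsv in the temporary directory through a link would rewrite the file in the source dataset. A small demonstration of the hazard, assuming a filesystem that supports hard links:

    import os
    import shutil
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        original = os.path.join(tmp, "participants.tsv")
        with open(original, "w") as f:
            f.write("participant_id\nsub-01\nsub-02\n")

        # Hard link: both names refer to the same inode, so rewriting
        # through the link rewrites the original as well.
        linked = os.path.join(tmp, "linked.tsv")
        os.link(original, linked)
        with open(linked, "w") as f:
            f.write("participant_id\nsub-01\n")
        print(open(original).read())  # the original lost sub-02

        # shutil.copy2 creates an independent file; the original is safe.
        with open(original, "w") as f:
            f.write("participant_id\nsub-01\nsub-02\n")
        copied = os.path.join(tmp, "copied.tsv")
        shutil.copy2(original, copied)
        with open(copied, "w") as f:
            f.write("participant_id\nsub-01\n")
        print(open(original).read())  # still has both subjects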
@@ -271,7 +284,7 @@ def validate(
         else:
             val_tsv = str(bids_dir) + "/code/CuBIDS/" + str(output_prefix) + "_validation.tsv"
 
-        parsed.to_csv(val_tsv, sep="\t", index=False)
+        parsed.to_csv(val_tsv, sep="\t", index=False, na_rep="n/a")
 
         # build validation data dictionary json sidecar
         val_dict = get_val_dictionary()
@@ -363,7 +376,7 @@ def validate(
         else:
             val_tsv = str(bids_dir) + "/code/CuBIDS/" + str(output_prefix) + "_validation.tsv"
 
-        parsed.to_csv(val_tsv, sep="\t", index=False)
+        parsed.to_csv(val_tsv, sep="\t", index=False, na_rep="n/a")
 
         # build validation data dictionary json sidecar
         val_dict = get_val_dictionary()
@@ -487,6 +500,7 @@ def apply(
     edited_summary_tsv,
     files_tsv,
     new_tsv_prefix,
+    n_cpus=1,
 ):
     """Apply the tsv changes.

@@ -525,6 +539,7 @@ def apply(
         str(files_tsv),
         str(new_tsv_prefix),
         raise_on_error=False,
+        n_cpus=n_cpus,
     )


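For context on where n_cpus ends up: per the CLI help text, it is forwarded as --jobs to datalad save and datalad run. A hedged sketch of that forwarding pattern using DataLad's Python API, where the jobs keyword mirrors the CLI flag; the function name and dataset path here are illustrative, not actual CuBIDS internals:

    from datalad.api import Dataset

    def save_with_jobs(dataset_path, message, n_cpus=1):
        """Illustrative only: pass an n_cpus-style setting to `datalad save`."""
        ds = Dataset(dataset_path)
        # Equivalent to: datalad save -m "<message>" --jobs <n_cpus>
        ds.save(message=message, jobs=n_cpus)

    # Usage sketch:
    # save_with_jobs("/path/to/BIDS_Dataset_DataLad", "apply tsv changes", n_cpus=4)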
8 changes: 8 additions & 0 deletions docs/example.rst
@@ -413,6 +413,14 @@ We can execute ``cubids apply`` with the following command:

     $ cubids apply BIDS_Dataset_DataLad v0_edited_summary.tsv v0_files.tsv v1 --use-datalad
 
+.. note::
+    For large datasets, you can speed up DataLad operations by using the ``--n-cpus`` flag
+    to enable parallel jobs for ``datalad save`` and ``datalad run``. For example, to use 4 CPUs:
+
+    .. code-block:: console
+
+        $ cubids apply BIDS_Dataset_DataLad v0_edited_summary.tsv v0_files.tsv v1 --use-datalad --n-cpus 4
+
 Checking our git log, we can see that our changes from apply have been saved.
 
 .. image:: _static/screenshot_7.png