Add parallel processing to cubids apply (#481)

tien-tong · web-flow · commit e722a67dcf53 · 2025-11-18T16:08:58.000-05:00
* add --n-cpus to cubids apply

* fix pytests

* fix pytests

* Handle both participants.tsv and participants.json in subject validation

* add logger warning

* add na_rep="n/a" to all to_csv
diff --git a/cubids/cli.py b/cubids/cli.py
@@ -535,6 +535,20 @@ def _parse_apply():
         required=False,
     )
 
+    parser.add_argument(
+        "--n-cpus",
+        "--n_cpus",
+        type=int,
+        action="store",
+        dest="n_cpus",
+        default=1,
+        help=(
+            "Number of CPUs to use for datalad jobs. "
+            "Used as --jobs for datalad save and datalad run."
+        ),
+        required=False,
+    )
+
     return parser
 
 
diff --git a/cubids/cubids.py b/cubids/cubids.py
diff --git a/cubids/metadata_merge.py b/cubids/metadata_merge.py
@@ -381,7 +381,7 @@ def group_by_acquisition_sets(files_tsv, output_prefix, acq_group_level, is_long
 
     # Write the mapping of subject/session to
     acq_group_df = pd.DataFrame(grouped_sub_sess)
-    acq_group_df.to_csv(output_prefix + "_AcqGrouping.tsv", sep="\t", index=False)
+    acq_group_df.to_csv(output_prefix + "_AcqGrouping.tsv", sep="\t", index=False, na_rep="n/a")
 
     # Create data dictionary for acq group tsv
     acq_dict = get_acq_dictionary(is_longitudinal)
diff --git a/cubids/tests/test_apply.py b/cubids/tests/test_apply.py
@@ -323,12 +323,12 @@ def test_cubids_apply_intendedfor(
     # Create a CuBIDS summary tsv
     summary_tsv = tmpdir / "summary.tsv"
     df = pd.DataFrame(summary_data)
-    df.to_csv(summary_tsv, sep="\t", index=False)
+    df.to_csv(summary_tsv, sep="\t", index=False, na_rep="n/a")
 
     # Create a CuBIDS files tsv
     files_tsv = tmpdir / "files.tsv"
     df = pd.DataFrame(fdata)
-    df.to_csv(files_tsv, sep="\t", index=False)
+    df.to_csv(files_tsv, sep="\t", index=False, na_rep="n/a")
 
     # Run cubids apply
     if isinstance(expected, str):
diff --git a/cubids/tests/test_bond.py b/cubids/tests/test_bond.py
@@ -459,7 +459,7 @@ def test_tsv_merge_no_datalad(tmp_path):
     summary_df.loc[fa_nan_dwi_row, "MergeInto"] = summary_df.ParamGroup[complete_dwi_row]
 
     valid_tsv_file = tsv_prefix + "_valid_summary.tsv"
-    summary_df.to_csv(valid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(valid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     # about to apply merges!
 
@@ -472,7 +472,7 @@ def test_tsv_merge_no_datalad(tmp_path):
         complete_dwi_row
     ]
     invalid_tsv_file = tsv_prefix + "_invalid_summary.tsv"
-    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     with pytest.raises(Exception):
         bod.apply_tsv_changes(
@@ -572,7 +572,7 @@ def test_tsv_merge_changes(tmp_path):
     summary_df.loc[fa_nan_dwi_row, "MergeInto"] = summary_df.ParamGroup[complete_dwi_row]
 
     valid_tsv_file = tsv_prefix + "_valid_summary.tsv"
-    summary_df.to_csv(valid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(valid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     # about to merge
     bod.apply_tsv_changes(valid_tsv_file, original_files_tsv, str(tmp_path / "ok_modified"))
@@ -584,7 +584,7 @@ def test_tsv_merge_changes(tmp_path):
         complete_dwi_row
     ]
     invalid_tsv_file = tsv_prefix + "_invalid_summary.tsv"
-    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False)
+    summary_df.to_csv(invalid_tsv_file, sep="\t", index=False, na_rep="n/a")
 
     with pytest.raises(Exception):
         bod.apply_tsv_changes(
diff --git a/cubids/tests/utils.py b/cubids/tests/utils.py
@@ -108,7 +108,7 @@ def _add_deletion(summary_tsv):
     """
     df = pd.read_table(summary_tsv)
     df.loc[3, "MergeInto"] = 0
-    df.to_csv(summary_tsv, sep="\t", index=False)
+    df.to_csv(summary_tsv, sep="\t", index=False, na_rep="n/a")
     return df.loc[3, "KeyParamGroup"]
 
 
diff --git a/cubids/workflows.py b/cubids/workflows.py
@@ -127,34 +127,44 @@ def _link_or_copy(src_path, dst_path):
         if not os.path.exists(subject_folder_path):
             os.makedirs(subject_folder_path, exist_ok=True)
 
-        # Ensure participants.tsv is available in temp root and is a copy (not a link)
-        # Always COPY (never link) to avoid modifying the original file when filtering
-        participants_tsv_path = os.path.join(temporary_bids_dir, "participants.tsv")
-        # Always remove existing file first in case it was linked in the earlier loop
-        if os.path.exists(participants_tsv_path):
+        # Ensure participants.tsv and participants.json are available in temp root
+        # Always COPY (never link) to avoid modifying the original files when filtering
+        participants_files = ["participants.tsv", "participants.json"]
+        for filename in participants_files:
+            dest_path = os.path.join(temporary_bids_dir, filename)
+            # Always remove existing file first in case it was linked in the earlier loop
+            if os.path.exists(dest_path):
+                try:
+                    os.remove(dest_path)
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to remove existing file '{dest_path}': {e}. "
+                        "The file may be overwritten or cause conflicts."
+                    )
+            # Try to find source file in the provided file list
             try:
-                os.remove(participants_tsv_path)
-            except Exception:  # noqa: BLE001
-                pass
-        # Try to find a source participants.tsv in the provided file list
-        try:
-            source_participants_tsv_path = None
-            for candidate_path in files_list:
-                if os.path.basename(candidate_path) == "participants.tsv":
-                    source_participants_tsv_path = candidate_path
-                    break
-            # If not in file list, try to get it from the original bids_dir
-            if not source_participants_tsv_path and bids_dir:
-                potential_path = os.path.join(bids_dir, "participants.tsv")
-                if os.path.exists(potential_path):
-                    source_participants_tsv_path = potential_path
-            if source_participants_tsv_path:
-                # Always copy (not link) to protect the original file from modification
-                shutil.copy2(source_participants_tsv_path, participants_tsv_path)
-        except Exception:  # noqa: BLE001
-            pass
+                source_path = None
+                for candidate_path in files_list:
+                    if os.path.basename(candidate_path) == filename:
+                        source_path = candidate_path
+                        break
+                # If not in file list, try to get it from the original bids_dir
+                if not source_path and bids_dir:
+                    potential_path = os.path.join(bids_dir, filename)
+                    if os.path.exists(potential_path):
+                        source_path = potential_path
+                if source_path:
+                    # Always copy (not link) to protect the original file from modification
+                    shutil.copy2(source_path, dest_path)
+            except Exception as e:
+                source_info = source_path if source_path else "unknown location"
+                logger.warning(
+                    f"Failed to copy '{filename}' from '{source_info}' to '{dest_path}': {e}. "
+                    "The file may be missing or inaccessible."
+                )
 
         # If participants.tsv exists in the temp BIDS root, filter to current subject
+        participants_tsv_path = os.path.join(temporary_bids_dir, "participants.tsv")
         if os.path.exists(participants_tsv_path):
             try:
                 participants_table = pd.read_csv(participants_tsv_path, sep="\t")
@@ -166,10 +176,13 @@ def _link_or_copy(src_path, dst_path):
                         participants_tsv_path,
                         sep="\t",
                         index=False,
+                        na_rep="n/a",
                     )
-            except Exception as e:  # noqa: F841
-                # Non-fatal: continue validation even if filtering fails
-                pass
+            except Exception as e:
+                logger.warning(
+                    f"Failed to filter participants.tsv for subject {subject}: {e}. "
+                    "Continuing validation without filtering."
+                )
 
         # Run the validator
         call = build_validator_call(
@@ -271,7 +284,7 @@ def validate(
             else:
                 val_tsv = str(bids_dir) + "/code/CuBIDS/" + str(output_prefix) + "_validation.tsv"
 
-            parsed.to_csv(val_tsv, sep="\t", index=False)
+            parsed.to_csv(val_tsv, sep="\t", index=False, na_rep="n/a")
 
             # build validation data dictionary json sidecar
             val_dict = get_val_dictionary()
@@ -363,7 +376,7 @@ def validate(
             else:
                 val_tsv = str(bids_dir) + "/code/CuBIDS/" + str(output_prefix) + "_validation.tsv"
 
-            parsed.to_csv(val_tsv, sep="\t", index=False)
+            parsed.to_csv(val_tsv, sep="\t", index=False, na_rep="n/a")
 
             # build validation data dictionary json sidecar
             val_dict = get_val_dictionary()
@@ -487,6 +500,7 @@ def apply(
     edited_summary_tsv,
     files_tsv,
     new_tsv_prefix,
+    n_cpus=1,
 ):
     """Apply the tsv changes.
 
@@ -525,6 +539,7 @@ def apply(
         str(files_tsv),
         str(new_tsv_prefix),
         raise_on_error=False,
+        n_cpus=n_cpus,
     )
 
 
diff --git a/docs/example.rst b/docs/example.rst
@@ -413,6 +413,14 @@ We can execute ``cubids apply`` with the following command:
 
     $ cubids apply BIDS_Dataset_DataLad v0_edited_summary.tsv v0_files.tsv v1 --use-datalad
 
+.. note::
+    For large datasets, you can speed up DataLad operations by using the ``--n-cpus`` flag
+    to enable parallel jobs for ``datalad save`` and ``datalad run`` operations. For example, to use 4 CPUs:
+
+    .. code-block:: console
+
+        $ cubids apply BIDS_Dataset_DataLad v0_edited_summary.tsv v0_files.tsv v1 --use-datalad --n-cpus 4
+
 Checking our git log, we can see that our changes from apply have been saved.
 
 .. image:: _static/screenshot_7.png