Skip to content

Commit 093bca2

Browse files
authored
[REF] UKB : Modify tsv writing to fit BIDS specifications (aramis-lab#1526)
* Split write-bids function
* Add write function specific to images
* WIP 1206
* get_ext_from_sidecars
* write_scans
* Unit test for participants
* Add test for sessions
* Add test for scans
* Add UKB to nonregression
1 parent 8a79ccb commit 093bca2

File tree

3 files changed

+171
-91
lines changed

3 files changed

+171
-91
lines changed

clinica/converters/ukb_to_bids/_utils.py

Lines changed: 73 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33

44
import numpy as np
55
import pandas as pd
6+
from fsspec.implementations.local import LocalFileSystem
7+
8+
from clinica.converters._utils import write_to_tsv
9+
from clinica.dataset.bids._filename import Extension
10+
from clinica.utils.stream import cprint
611

712
__all__ = [
813
"find_clinical_data",
@@ -303,17 +308,21 @@ def write_bids(
303308
scans: pd.DataFrame,
304309
dataset_directory: Path,
305310
) -> None:
306-
from fsspec.implementations.local import LocalFileSystem
311+
fs = LocalFileSystem(auto_mkdir=True)
312+
_write_description_and_participants(participants, to, fs)
313+
_write_sessions(sessions, to, fs)
314+
_write_images(scans, to, dataset_directory, fs)
315+
_write_scans(scans, to)
307316

308-
from clinica.converters._utils import write_to_tsv
317+
318+
def _write_description_and_participants(
319+
participants: pd.DataFrame, to: Path, fs: LocalFileSystem
320+
):
309321
from clinica.converters.study_models import StudyName
310322
from clinica.dataset import BIDSDatasetDescription
311323

312-
fs = LocalFileSystem(auto_mkdir=True)
313-
314-
participants = participants.droplevel(
315-
["sessions", "modality", "bids_filename"]
316-
).drop_duplicates()
324+
participants = participants.droplevel(["sessions", "modality", "bids_filename"])
325+
participants = participants.loc[~participants.index.duplicated(keep="first")]
317326

318327
# Ensure BIDS hierarchy is written first.
319328
with fs.transaction:
@@ -326,72 +335,79 @@ def write_bids(
326335
with fs.open(str(to / "participants.tsv"), "w") as participant_file:
327336
write_to_tsv(participants, participant_file)
328337

338+
339+
def _write_sessions(sessions: pd.DataFrame, to: Path, fs: LocalFileSystem):
329340
for participant_id, data_frame in sessions.groupby("participant_id"):
330-
sessions = data_frame.droplevel(
341+
sessions_to_write = data_frame.droplevel(
331342
["participant_id", "modality", "bids_filename"]
332343
).drop_duplicates()
333-
344+
sessions_to_write.index.name = "session_id"
334345
sessions_filepath = to / str(participant_id) / f"{participant_id}_sessions.tsv"
335346
with fs.open(str(sessions_filepath), "w") as sessions_file:
336-
write_to_tsv(sessions, sessions_file)
347+
write_to_tsv(sessions_to_write, sessions_file)
337348

338-
scans = scans.reset_index().set_index(["bids_full_path"], verify_integrity=True)
339349

350+
def _write_images(scans: pd.DataFrame, to: Path, source: Path, fs: LocalFileSystem):
351+
scans = scans.reset_index().set_index(["bids_full_path"], verify_integrity=True)
340352
for bids_full_path, metadata in scans.iterrows():
341353
if metadata["modality_num"] != "20217" and metadata["modality_num"] != "20225":
342354
_copy_file_to_bids(
343-
zipfile=dataset_directory / metadata["source_zipfile"],
355+
zipfile=source / metadata["source_zipfile"],
344356
filenames=[metadata["source_filename"]] + metadata["sidecars"],
345357
bids_path=to / bids_full_path,
346358
)
347359
else:
348360
_convert_dicom_to_nifti(
349-
zipfiles=dataset_directory / metadata["source_zipfile"],
361+
zipfiles=source / metadata["source_zipfile"],
350362
bids_path=to / bids_full_path,
363+
fs=fs,
351364
)
352365
if metadata["modality_num"] == "20217":
353-
_import_event_tsv(bids_path=to)
354-
355-
_write_row_in_scans_tsv_file(metadata, to)
356-
357-
358-
def _write_row_in_scans_tsv_file(row: pd.Series, to: Path):
359-
"""Write rows from a dataframe into a scans.tsv file.
366+
_import_event_tsv(bids_path=to, fs=fs)
360367

361-
Parameters
362-
----------
363-
row : pd.Series
364-
Row to write into the scans.tsv file.
365-
366-
to : Path
367-
Path to the BIDS folder.
368-
"""
369-
scans_filepath = (
370-
to
371-
/ str(row.participant_id)
372-
/ str(row.sessions)
373-
/ f"{row.participant_id}_{row.sessions}_scans.tsv"
374-
)
375-
row_to_write = _serialize_row(
376-
row.drop(["participant_id", "sessions"]),
377-
write_column_names=not scans_filepath.exists(),
378-
)
379-
with open(scans_filepath, "a") as scans_file:
380-
scans_file.write(f"{row_to_write}\n")
381368

369+
def _get_extensions_from_sidecars(sidecars: list[str]) -> list[str]:
370+
extensions = []
371+
for side in sidecars:
372+
try:
373+
extensions += [Extension("." + side.split(".")[1])]
374+
except (ValueError, IndexError) as e:
375+
cprint(
376+
"An invalid extension for bids files was found and won't be registered in scans.tsv. Please check your files.",
377+
lvl="warning",
378+
)
379+
return extensions + [Extension(".nii.gz")]
382380

383-
def _serialize_row(row: pd.Series, write_column_names: bool) -> str:
384-
row_dict = row.to_dict()
385-
to_write = (
386-
[row_dict.keys(), row_dict.values()]
387-
if write_column_names
388-
else [row_dict.values()]
389-
)
390-
return "\n".join([_serialize_list(list(_)) for _ in to_write])
391381

382+
def _write_scans(scans: pd.DataFrame, to: Path) -> None:
383+
for subject_session, data in scans.groupby(["participant_id", "sessions"]):
384+
data["filename_no_extension"] = data["bids_full_path"].apply(
385+
lambda x: f"{Path(x).parent.name}/{Path(x).name}"
386+
)
387+
data["extensions"] = data["sidecars"].apply(
388+
lambda x: _get_extensions_from_sidecars(x)
389+
)
392390

393-
def _serialize_list(data: list, sep="\t") -> str:
394-
return sep.join([str(value) for value in data])
391+
to_write = pd.DataFrame(columns=["filename"])
392+
393+
for _, line in data.iterrows():
394+
for extension in line.extensions:
395+
to_write = pd.concat(
396+
[
397+
to_write,
398+
pd.DataFrame(
399+
{"filename": [line.filename_no_extension + extension]}
400+
),
401+
]
402+
)
403+
to_write.to_csv(
404+
to
405+
/ subject_session[0]
406+
/ subject_session[1]
407+
/ f"{subject_session[0]}_{subject_session[1]}_scans.tsv",
408+
sep="\t",
409+
index=False,
410+
)
395411

396412

397413
def _copy_file_to_bids(zipfile: Path, filenames: List[Path], bids_path: Path) -> None:
@@ -408,7 +424,9 @@ def _copy_file_to_bids(zipfile: Path, filenames: List[Path], bids_path: Path) ->
408424
f.write(fs.cat(filename))
409425

410426

411-
def _convert_dicom_to_nifti(zipfiles: Path, bids_path: Path) -> None:
427+
def _convert_dicom_to_nifti(
428+
zipfiles: Path, bids_path: Path, fs: LocalFileSystem
429+
) -> None:
412430
"""Install the requested files in the BIDS dataset.
413431
First, the dicom is extracted into a temporary directory.
414432
Second, the extracted dicom is converted in the right place using dcm2niix."""
@@ -418,10 +436,6 @@ def _convert_dicom_to_nifti(zipfiles: Path, bids_path: Path) -> None:
418436
import zipfile
419437
from pathlib import PurePath
420438

421-
from fsspec.implementations.local import LocalFileSystem
422-
423-
fs = LocalFileSystem(auto_mkdir=True)
424-
425439
zf = zipfile.ZipFile(zipfiles)
426440
try:
427441
bids_path.parent.mkdir(exist_ok=True, parents=True)
@@ -430,15 +444,9 @@ def _convert_dicom_to_nifti(zipfiles: Path, bids_path: Path) -> None:
430444
pass
431445
with tempfile.TemporaryDirectory() as tempdir:
432446
zf.extractall(tempdir)
433-
command = [
434-
"dcm2niix",
435-
"-w",
436-
"0",
437-
]
438-
command += ["-9", "-z", "y"]
439-
command += ["-b", "y", "-ba", "y"]
440-
command += [tempdir]
441-
subprocess.run(command)
447+
subprocess.run(
448+
["dcm2niix", "-w", "0", "-9", "-z", "y", "-b", "y", "-ba", "y", tempdir]
449+
)
442450
fmri_image_path = _find_largest_imaging_data(Path(tempdir))
443451
fmri_image_path = fmri_image_path or ""
444452
fs.copy(str(fmri_image_path), str(bids_path) + ".nii.gz")
@@ -485,11 +493,8 @@ def _select_sessions(x: pd.Series) -> Optional[float]:
485493
return None
486494

487495

488-
def _import_event_tsv(bids_path: Path) -> None:
496+
def _import_event_tsv(bids_path: Path, fs: LocalFileSystem) -> None:
489497
"""Import the csv containing the events' information."""
490-
from fsspec.implementations.local import LocalFileSystem
491-
492-
fs = LocalFileSystem(auto_mkdir=True)
493498
event_tsv = (
494499
Path(__file__).parents[2]
495500
/ "resources"

test/nonregression/test_run_converters.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,6 @@ def test_converters(cmdopt, tmp_path, study: StudyName):
4242
StudyName.NIFD,
4343
StudyName.OASIS3,
4444
StudyName.GENFI,
45+
StudyName.UKB,
4546
):
4647
compare_bids_tsv(output_dir, ref_dir / "bids")

test/unittests/converters/test_ukb_to_bids_utils.py

Lines changed: 97 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import pandas as pd
55
import pytest
6+
from fsspec.implementations.local import LocalFileSystem
67

78

89
def test_read_imaging_data(tmp_path):
@@ -20,33 +21,20 @@ def test_read_imaging_data(tmp_path):
2021
read_imaging_data(path_to_zip)
2122

2223

23-
def test_write_row_in_scans_tsv_file(tmp_path):
24-
from clinica.converters.ukb_to_bids._utils import _write_row_in_scans_tsv_file
25-
26-
row = pd.Series(
27-
{
28-
"participant_id": "sub-0001",
29-
"sessions": "ses-M000",
30-
"filename": "sub-0001_ses-M000_T1w.nii.gz",
31-
"modality": "T1w",
32-
}
33-
)
34-
35-
target_dir = tmp_path / "BIDS" / "sub-0001" / "ses-M000"
36-
target_dir.mkdir(parents=True)
37-
38-
_write_row_in_scans_tsv_file(row, tmp_path / "BIDS")
24+
@pytest.mark.parametrize(
25+
"sidecars, expected", [([], {".nii.gz"}), (["truc.json"], {".nii.gz", ".json"})]
26+
)
27+
def test_get_extensions_from_sidecars_success(sidecars, expected):
28+
from clinica.converters.ukb_to_bids._utils import _get_extensions_from_sidecars
3929

40-
scans_tsv = target_dir / "sub-0001_ses-M000_scans.tsv"
41-
assert scans_tsv.exists()
30+
assert expected == set(_get_extensions_from_sidecars(sidecars))
4231

43-
content = scans_tsv.read_text().strip().splitlines()
4432

45-
columns_names = content[0].split("\t")
46-
columns_items = content[1].split("\t")
33+
@pytest.mark.parametrize("sidecars", [["foo"], [".bar"]])
34+
def test_get_extensions_from_sidecars_error(sidecars):
35+
from clinica.converters.ukb_to_bids._utils import _get_extensions_from_sidecars
4736

48-
assert columns_names == ["filename", "modality"]
49-
assert columns_items == ["sub-0001_ses-M000_T1w.nii.gz", "T1w"]
37+
assert _get_extensions_from_sidecars(sidecars) == [".nii.gz"]
5038

5139

5240
@pytest.mark.parametrize(
@@ -72,3 +60,89 @@ def test_select_sessions(subject_id, source_session, age_2, age_3, expected):
7260
)
7361

7462
assert expected == _select_sessions(clinical_data)
63+
64+
65+
def test_write_description_and_participants(tmp_path):
66+
from clinica.converters.ukb_to_bids._utils import (
67+
_write_description_and_participants,
68+
)
69+
70+
to = tmp_path / "BIDS"
71+
participants = pd.DataFrame(
72+
{
73+
"participants": ["1", "2", "2"],
74+
"sessions": ["ses-M000", "ses-M000", "ses-M001"],
75+
"modality": ["dwi", "dwi", "dwi"],
76+
"bids_filename": ["1-0-dwi", "2-0-dwi", "2-1-dwi"],
77+
"sex": ["F", "F", "F"],
78+
}
79+
)
80+
participants.set_index(
81+
["participants", "sessions", "modality", "bids_filename"], inplace=True
82+
)
83+
_write_description_and_participants(
84+
participants, to, LocalFileSystem(auto_mkdir=True)
85+
)
86+
87+
tsv_files = list(to.rglob("*tsv"))
88+
json_files = list(to.rglob("*json"))
89+
90+
assert len(tsv_files) == 1
91+
assert len(json_files) == 1
92+
93+
tsv = pd.read_csv(tsv_files[0], sep="\t")
94+
assert set(tsv.columns) == {"participants", "sex"}
95+
assert len(tsv) == 2
96+
97+
98+
def test_write_sessions(tmp_path):
99+
from clinica.converters.ukb_to_bids._utils import _write_sessions
100+
101+
to = tmp_path / "BIDS"
102+
103+
sessions = pd.DataFrame(
104+
{
105+
"participant_id": ["1", "2", "2"],
106+
"sessions": ["ses-M000", "ses-M000", "ses-M001"],
107+
"modality": ["dwi", "dwi", "dwi"],
108+
"bids_filename": ["1-0-dwi", "2-0-dwi", "2-1-dwi"],
109+
"session_identifier": ["2", "2", "3"],
110+
}
111+
)
112+
sessions.set_index(
113+
["participant_id", "sessions", "modality", "bids_filename"], inplace=True
114+
)
115+
116+
_write_sessions(sessions, to, LocalFileSystem(auto_mkdir=True))
117+
tsv_files = list(to.rglob("*tsv"))
118+
119+
assert len(tsv_files) == 2
120+
121+
tsv = pd.read_csv(to / "2" / "2_sessions.tsv", sep="\t")
122+
assert len(tsv) == 2
123+
124+
125+
def test_write_scans(tmp_path):
126+
from clinica.converters.ukb_to_bids._utils import _write_scans
127+
128+
to = tmp_path / "BIDS"
129+
(to / "sub-001" / "ses-M000").mkdir(parents=True, exist_ok=True)
130+
scans = pd.DataFrame(
131+
pd.DataFrame(
132+
{
133+
"participant_id": ["sub-001"],
134+
"sessions": ["ses-M000"],
135+
"modality": ["T1w"],
136+
"bids_filename": ["sub-001_ses-M000_T1w"],
137+
"bids_full_path": [
138+
to / "sub-001" / "ses-M000" / "sub-001_ses-M000_T1w"
139+
],
140+
"sidecars": [["truc.json"]],
141+
}
142+
)
143+
)
144+
_write_scans(scans, to)
145+
tsv_files = list(to.rglob("*tsv"))
146+
assert len(tsv_files) == 1
147+
tsv = pd.read_csv(tsv_files[0], sep="\t")
148+
assert len(tsv) == 2

0 commit comments

Comments
 (0)