Skip to content

Commit 0c15916

Browse files
[ENH] GENFI-to-BIDS : More clinical data added to tabular (.tsv) files (#1641)
* mandatory + optional clinical data added to tsv files + doc updated * Doc corrected * corrections added after review * unit tests added and pre-commit modified * code corrected after review * last corrections added after review * corrections after review * gif option (re)added * gif_specs added * last small modif to help comment for the gif option * last corrections added after review for gif readdition * last corrections added --------- Co-authored-by: ISMAILI Adam <Adam ISMAILI>
1 parent ff3c9eb commit 0c15916

File tree

9 files changed

+1231
-339
lines changed

9 files changed

+1231
-339
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ repos:
1313
rev: v2.3.0
1414
hooks:
1515
- id: codespell
16+
args: ["--ignore-words-list=ALS"]
1617
additional_dependencies:
1718
- tomli
1819
- repo: https://github.com/python-poetry/poetry

clinica/converters/genfi_to_bids/_converter.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def convert(
1313
bids_dir: UserProvidedPath,
1414
path_to_clinical: Optional[UserProvidedPath] = None,
1515
gif: Optional[bool] = False,
16+
full: Optional[bool] = False,
1617
path_to_clinical_tsv: Optional[UserProvidedPath] = None,
1718
subjects: Optional[UserProvidedPath] = None,
1819
n_procs: Optional[int] = 1,
@@ -39,6 +40,9 @@ def convert(
3940
gif: bool, optional
4041
If True, indicates the user wants to have the values of the gif parcellation
4142
43+
full: bool, optional
44+
If True, indicates the user wants to get all clinical data fields
45+
4246
path_to_clinical_tsv: Path, optional
4347
The path to a TSV file containing the additional data the user wants to have in the BIDS output.
4448
If None, no additional data will be added.
@@ -91,7 +95,9 @@ def convert(
9195
if path_to_clinical:
9296
clinical_data = parse_clinical_data(path_to_clinical)
9397
imaging_data = merge_imaging_and_clinical_data(imaging_data, clinical_data)
94-
results = prepare_dataset_to_bids_format(imaging_data, gif, path_to_clinical_tsv)
98+
results = prepare_dataset_to_bids_format(
99+
imaging_data, path_to_clinical_tsv, gif, full
100+
)
95101
write_bids(
96102
to=bids_dir,
97103
participants=results["participants"],

clinica/converters/genfi_to_bids/_utils.py

Lines changed: 123 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -135,100 +135,162 @@ def _check_file(directory: Path, pattern: str) -> Path:
135135

136136

137137
def parse_clinical_data(clinical_data_directory: Path) -> pd.DataFrame:
    """Load the GENFI clinical spreadsheets and merge them into one dataframe.

    The imaging spreadsheet is loaded first and used as the starting frame.
    The remaining spreadsheets are loaded in the order listed below and
    handed, together with the imaging frame, to ``_complete_clinical_data``.

    Parameters
    ----------
    clinical_data_directory: Path
        The path to the clinical data.

    Returns
    -------
    pd.DataFrame
        The dataframe returned by ``_complete_clinical_data``.
    """
    imaging_df = _find_clinical_data(clinical_data_directory, "FINAL*IMAGING*.xlsx")

    other_patterns = (
        "FINAL*DEMOGRAPHICS*.xlsx",
        "FINAL*CLINICAL*.xlsx",
        "FINAL*BIOSAMPLES*.xlsx",
        "FINAL*NEUROPSYCH*.xlsx",
        "FINAL*GENETICS*.xlsx",
    )
    other_dfs = [
        _find_clinical_data(clinical_data_directory, file_pattern)
        for file_pattern in other_patterns
    ]

    return _complete_clinical_data(imaging_df, other_dfs)
139154

140155

141156
def _find_clinical_data(
    clinical_data_directory: Path,
    pattern: str,
) -> pd.DataFrame:
    """Find and load the clinical spreadsheet matching ``pattern``.

    Parameters
    ----------
    clinical_data_directory: Path
        The path to the clinical data.

    pattern: str
        Pattern passed to ``_check_file`` to select the spreadsheet inside
        the clinical data directory (for instance "FINAL*IMAGING*.xlsx").

    Returns
    -------
    pd.DataFrame
        Dataframe containing the clinical data, as produced by ``_read_file``.
    """
    cprint("Looking for clinical data...", lvl="info")

    spreadsheet_path = _check_file(clinical_data_directory, pattern)
    return _read_file(spreadsheet_path)
168178

169179

170180
def _read_file(data_file: Path) -> pd.DataFrame:
    """Read a clinical Excel file into a single dataframe.

    The first sheet is always read. The sheet at index 1 is read as well,
    unless the file name contains "GENETICS", in which case an empty
    dataframe is used in its place. Both frames are concatenated, their
    dtypes are converted with ``convert_dtypes`` and the column names are
    lower-cased with spaces replaced by underscores.

    Parameters
    ----------
    data_file: Path
        The path to the Excel file to read.

    Returns
    -------
    pd.DataFrame
        The concatenated dataframe with normalized column names.
    """
    first_sheet = pd.read_excel(data_file)

    if "GENETICS" in data_file.name:
        second_sheet = pd.DataFrame()
    else:
        second_sheet = pd.read_excel(data_file, sheet_name=1)

    combined = pd.concat([first_sheet, second_sheet]).convert_dtypes()
    return combined.rename(columns=lambda name: name.lower().replace(" ", "_"))
181198

182199

200+
def _merge_and_coalesce(
201+
left_df: pd.DataFrame,
202+
right_df: pd.DataFrame,
203+
on: List[str],
204+
) -> pd.DataFrame:
205+
"""
206+
Sub-function to outer merge and coalesce duplicates :
207+
- Keeps all keys from both left_df and right_df
208+
- For overlapping non-key columns, fills left_df n/a values with right_df actual values
209+
"""
210+
# Drops rows without keys
211+
left_df = left_df.copy().dropna(subset=on)
212+
right_df = right_df.copy().dropna(subset=on)
213+
214+
# Outer merge dfs
215+
merged_df = left_df.merge(
216+
right_df,
217+
how="outer",
218+
on=on,
219+
suffixes=("", "_duplicate"),
220+
)
221+
222+
# Coalesces overlapping columns
223+
for col in right_df.columns:
224+
if (dup_col := f"{col}_duplicate") in merged_df.columns:
225+
merged_df[col] = merged_df[col].combine_first(
226+
merged_df[dup_col]
227+
) # Fills n/a in left_df with right_df values
228+
merged_df.drop(columns=[dup_col], inplace=True)
229+
230+
return merged_df
231+
232+
183233
def _complete_clinical_data(
    df_imaging: pd.DataFrame,
    df_clinical_list: List[pd.DataFrame],
) -> pd.DataFrame:
    """Merge the different clinical dataframes into one.

    Starting from a copy of the imaging dataframe, each dataframe of
    ``df_clinical_list`` is merged in turn with ``_merge_and_coalesce`` on
    the ``blinded_code``, ``blinded_site`` and ``visit`` columns.

    Parameters
    ----------
    df_imaging: pd.DataFrame
        Dataframe containing the imaging data

    df_clinical_list: List[pd.DataFrame]
        List of dataframes containing the remaining clinical data

    Returns
    -------
    df_clinical_complete: pd.DataFrame
        Dataframe with the data from the input dataframes
    """
    keys = ["blinded_code", "blinded_site", "visit"]

    merged = df_imaging.copy()
    for extra_df in df_clinical_list:
        merged = _merge_and_coalesce(merged, extra_df, on=keys)

    return merged
262+
263+
264+
def _specs_depending_on_option(full: bool, gif: bool) -> str:
265+
"""Returns specs filename to use based on optional values.
266+
267+
Parameters
268+
----------
269+
full: bool
270+
If True, returns full specs filename
271+
272+
gif: bool
273+
If True, returns gif specs filename
274+
275+
Returns
276+
-------
277+
[specs_filename]: str
278+
Option-based specs filename
279+
"""
280+
if full:
281+
return "full_specs"
282+
283+
if gif:
284+
return "gif_specs"
285+
286+
return "mandatory_specs"
226287

227288

228289
def prepare_dataset_to_bids_format(
229290
complete_data_df: pd.DataFrame,
230-
gif: bool,
231291
path_to_clinical_tsv: Path,
292+
gif: bool = False,
293+
full: bool = False,
232294
) -> Dict[str, pd.DataFrame]:
233295
"""Selects the data needed to write the participants, sessions, and scans tsvs.
234296
@@ -237,26 +299,35 @@ def prepare_dataset_to_bids_format(
237299
complete_data_df: pd.DataFrame
238300
Dataframe containing the merged data extracted from the raw images and the clinical data
239301
240-
gif: bool
241-
If True, indicates the user wants to have the values of the gif parcellation
242-
243302
path_to_clinical_tsv: Path
244303
TSV file containing the data fields the user wishes to have from the excel spreadsheets
245304
305+
gif: bool
306+
False by default. If True, indicates the user wants to have the values of the gif parcellation
307+
308+
full: bool
309+
False by default. If True, indicates the user wants to get all clinical data fields
310+
246311
Returns
247312
-------
248313
Dict[str, pd.DataFrame]
249314
Dictionary containing as key participants, sessions and scans, and the values wanted for each tsv
250315
"""
316+
251317
complete_data_df = complete_data_df.drop_duplicates(
252318
subset=["participant_id", "session_id", "modality", "run_num", "bids_filename"]
253319
).set_index(
254320
["participant_id", "session_id", "modality", "run_num", "bids_filename"],
255321
verify_integrity=True,
256322
)
257-
specifications = pd.read_csv(Path(__file__).parent / "specifications.csv", sep=";")
258-
if not gif:
259-
specifications = specifications.head(8)
323+
324+
specifications = pd.read_csv(
325+
Path(__file__).parent
326+
/ "specifications"
327+
/ f"{_specs_depending_on_option(full, gif)}.csv",
328+
sep=";",
329+
)
330+
260331
# add additional data through csv
261332
if path_to_clinical_tsv:
262333
additional_data_df = pd.read_csv(path_to_clinical_tsv, sep="\t")

clinica/converters/genfi_to_bids/cli.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,17 @@
1010
"--clinical-data-dir",
1111
"clinical_data_directory",
1212
type=click.Path(exists=True, file_okay=False, resolve_path=True),
13-
help="Path to the clinical data directory",
13+
help="Path to the clinical data directory.",
14+
)
15+
16+
# Boolean flag; the file name in the help text matches the "sessions.tsv"
# wording used by the other options of this command.
gif = click.option("-gif", is_flag=True, help="Add values from gif to sessions.tsv.")
17+
18+
# Boolean flag requesting every clinical field (mandatory + optional).
full = click.option(
    "-full",
    help="Add all clinical data (mandatory + optional) to sessions.tsv.",
    is_flag=True,
)
1523

16-
gif = click.option("-gif", is_flag=True, help="Add values from gif to session.tsv")
1724
clinical_data_tsv = click.option(
1825
"-cdt",
1926
"--clinical-data-tsv",
@@ -28,13 +35,15 @@
2835
@cli_param.bids_directory
2936
@clinical_data_directory
3037
@gif
38+
@full
3139
@clinical_data_tsv
3240
def cli(
3341
dataset_directory: PathLike,
3442
bids_directory: PathLike,
3543
clinical_data_directory: Optional[PathLike] = None,
3644
clinical_data_tsv: Optional[PathLike] = None,
3745
gif: Optional[bool] = False,
46+
full: Optional[bool] = False,
3847
) -> None:
3948
"""GENFI to BIDS converter.
4049
@@ -48,6 +57,7 @@ def cli(
4857
bids_directory,
4958
clinical_data_directory,
5059
gif=gif,
60+
full=full,
5161
path_to_clinical_tsv=clinical_data_tsv,
5262
)
5363

0 commit comments

Comments
 (0)