@@ -135,100 +135,162 @@ def _check_file(directory: Path, pattern: str) -> Path:
135135
136136
137137def parse_clinical_data (clinical_data_directory : Path ) -> pd .DataFrame :
138- return _complete_clinical_data (* _find_clinical_data (clinical_data_directory ))
138+ return _complete_clinical_data (
139+ _find_clinical_data (
140+ clinical_data_directory ,
141+ "FINAL*IMAGING*.xlsx" ,
142+ ),
143+ [
144+ _find_clinical_data (clinical_data_directory , pattern )
145+ for pattern in (
146+ "FINAL*DEMOGRAPHICS*.xlsx" ,
147+ "FINAL*CLINICAL*.xlsx" ,
148+ "FINAL*BIOSAMPLES*.xlsx" ,
149+ "FINAL*NEUROPSYCH*.xlsx" ,
150+ "FINAL*GENETICS*.xlsx" ,
151+ )
152+ ],
153+ )
139154
140155
141156def _find_clinical_data (
142157 clinical_data_directory : Path ,
143- ) -> tuple [pd .DataFrame , pd .DataFrame , pd .DataFrame , pd .DataFrame , pd .DataFrame ]:
144- """Finds the clinical data associated with the dataset.
158+ pattern : str ,
159+ ) -> pd .DataFrame :
160+ """Finds the clinical data matching the pattern in the clinical data directory.
145161
146162 Parameters
147163 ----------
148164 clinical_data_directory: Path
149165 The path to the clinical data.
150166
167+ pattern : str
168+ Pattern to find the clinical data
169+
151170 Returns
152171 -------
153- List[ pd.DataFrame]
154- Dataframes containing the clinical data
172+ pd.DataFrame
173+ Dataframe containing the clinical data
155174 """
156- cprint ("Looking for clinical data." , lvl = "info" )
175+ cprint ("Looking for clinical data... " , lvl = "info" )
157176
158- return tuple (
159- _read_file (_check_file (clinical_data_directory , pattern ))
160- for pattern in (
161- "FINAL*DEMOGRAPHICS*.xlsx" ,
162- "FINAL*IMAGING*.xlsx" ,
163- "FINAL*CLINICAL*.xlsx" ,
164- "FINAL*BIOSAMPLES*.xlsx" ,
165- "FINAL*NEUROPSYCH*.xlsx" ,
166- )
167- )
177+ return _read_file (_check_file (clinical_data_directory , pattern ))
168178
169179
170180def _read_file (data_file : Path ) -> pd .DataFrame :
181+ df_genfi_1 = pd .read_excel (data_file )
182+ df_genfi_2 = (
183+ pd .read_excel (data_file , sheet_name = 1 )
184+ if "GENETICS" not in data_file .name
185+ else pd .DataFrame ()
186+ )
187+
171188 return (
172189 pd .concat (
173190 [
174- pd . read_excel ( data_file ) ,
175- pd . read_excel ( data_file , sheet_name = 1 ) ,
191+ df_genfi_1 ,
192+ df_genfi_2 ,
176193 ]
177194 )
178195 .convert_dtypes ()
179196 .rename (columns = lambda x : x .lower ().replace (" " , "_" ))
180197 )
181198
182199
200+ def _merge_and_coalesce (
201+ left_df : pd .DataFrame ,
202+ right_df : pd .DataFrame ,
203+ on : List [str ],
204+ ) -> pd .DataFrame :
205+ """
206+ Helper that outer-merges two dataframes and coalesces duplicated columns:
207+ - Keeps all keys from both left_df and right_df
208+ - For overlapping non-key columns, fills missing (NA) values of left_df with the values from right_df
209+ """
210+ # Drops rows without keys
211+ left_df = left_df .copy ().dropna (subset = on )
212+ right_df = right_df .copy ().dropna (subset = on )
213+
214+ # Outer merge dfs
215+ merged_df = left_df .merge (
216+ right_df ,
217+ how = "outer" ,
218+ on = on ,
219+ suffixes = ("" , "_duplicate" ),
220+ )
221+
222+ # Coalesces overlapping columns
223+ for col in right_df .columns :
224+ if (dup_col := f"{ col } _duplicate" ) in merged_df .columns :
225+ merged_df [col ] = merged_df [col ].combine_first (
226+ merged_df [dup_col ]
227+ ) # Fills n/a in left_df with right_df values
228+ merged_df .drop (columns = [dup_col ], inplace = True )
229+
230+ return merged_df
231+
232+
183233def _complete_clinical_data (
184- df_demographics : pd .DataFrame ,
185234 df_imaging : pd .DataFrame ,
186- df_clinical : pd .DataFrame ,
187- df_biosamples : pd .DataFrame ,
188- df_neuropsych : pd .DataFrame ,
235+ df_clinical_list : List [pd .DataFrame ],
189236) -> pd .DataFrame :
190237 """Merges the different clinical dataframes into one.
191238
192239 Parameters
193240 ----------
194- df_demographics: pd.DataFrame
195- Dataframe containing the demographic data
196-
197241 df_imaging: pd.DataFrame
198242 Dataframe containing the imaging data
199243
200- df_clinical: pd.DataFrame
201- Dataframe containing the clinical data
202-
203- df_biosamples: pd.DataFrame
204- Dataframe containing the biosample data
205-
206- df_neuropsych: pd.DataFrame
207- Dataframe containing the neuropsych data
244+ df_clinical_list: List[pd.DataFrame]
245+ List of dataframes containing the remaining clinical data
208246
209247 Returns
210248 -------
211249 df_clinical_complete: pd.DataFrame
212- Dataframe with the data of the 3 input dataframes
250+ Dataframe with the data from the input dataframes
213251 """
214252 merge_key = ["blinded_code" , "blinded_site" , "visit" ]
215- df_clinical_complete = df_imaging .merge (
216- df_demographics , how = "inner" , on = merge_key
217- ).drop (columns = "diagnosis" )
218- df_clinical_complete = df_clinical_complete .merge (
219- df_biosamples , how = "inner" , on = merge_key
220- )
221- df_clinical_complete = df_clinical_complete .merge (
222- df_neuropsych , how = "inner" , on = merge_key
223- )
224- df_clinical = df_clinical .dropna (subset = merge_key )
225- return df_clinical_complete .merge (df_clinical , how = "inner" , on = merge_key )
253+
254+ df_clinical_complete = df_imaging .copy ()
255+
256+ for df in df_clinical_list :
257+ df_clinical_complete = _merge_and_coalesce (
258+ df_clinical_complete , df , on = merge_key
259+ )
260+
261+ return df_clinical_complete
262+
263+
264+ def _specs_depending_on_option (full : bool , gif : bool ) -> str :
265+ """Returns specs filename to use based on optional values.
266+
267+ Parameters
268+ ----------
269+ full: bool
270+ If True, returns full specs filename
271+
272+ gif: bool
273+ If True, returns gif specs filename
274+
275+ Returns
276+ -------
277+ specs_filename: str
278+ Option-based specs filename
279+ """
280+ if full :
281+ return "full_specs"
282+
283+ if gif :
284+ return "gif_specs"
285+
286+ return "mandatory_specs"
226287
227288
228289def prepare_dataset_to_bids_format (
229290 complete_data_df : pd .DataFrame ,
230- gif : bool ,
231291 path_to_clinical_tsv : Path ,
292+ gif : bool = False ,
293+ full : bool = False ,
232294) -> Dict [str , pd .DataFrame ]:
233295 """Selects the data needed to write the participants, sessions, and scans tsvs.
234296
@@ -237,26 +299,35 @@ def prepare_dataset_to_bids_format(
237299 complete_data_df: pd.DataFrame
238300 Dataframe containing the merged data extracted from the raw images and the clinical data
239301
240- gif: bool
241- If True, indicates the user wants to have the values of the gif parcellation
242-
243302 path_to_clinical_tsv: Path
244303 TSV file containing the data fields the user wishes to have from the excel spreadsheets
245304
305+ gif: bool
306+ False by default. If True, indicates the user wants to have the values of the gif parcellation
307+
308+ full: bool
309+ False by default. If True, indicates the user wants to get all clinical data fields
310+
246311 Returns
247312 -------
248313 Dict[str, pd.DataFrame]
249314 Dictionary containing as key participants, sessions and scans, and the values wanted for each tsv
250315 """
316+
251317 complete_data_df = complete_data_df .drop_duplicates (
252318 subset = ["participant_id" , "session_id" , "modality" , "run_num" , "bids_filename" ]
253319 ).set_index (
254320 ["participant_id" , "session_id" , "modality" , "run_num" , "bids_filename" ],
255321 verify_integrity = True ,
256322 )
257- specifications = pd .read_csv (Path (__file__ ).parent / "specifications.csv" , sep = ";" )
258- if not gif :
259- specifications = specifications .head (8 )
323+
324+ specifications = pd .read_csv (
325+ Path (__file__ ).parent
326+ / "specifications"
327+ / f"{ _specs_depending_on_option (full , gif )} .csv" ,
328+ sep = ";" ,
329+ )
330+
260331 # add additional data through csv
261332 if path_to_clinical_tsv :
262333 additional_data_df = pd .read_csv (path_to_clinical_tsv , sep = "\t " )
0 commit comments