test and code corrected after review

ISMAILI Adam · ISMAILI Adam · commit 1befa054064c · 2026-02-24T13:35:41.000+01:00
diff --git a/clinica/converters/genfi_to_bids/_utils.py b/clinica/converters/genfi_to_bids/_utils.py
@@ -286,56 +286,91 @@ def _specs_depending_on_option(full: bool, gif: bool) -> str:
     return "mandatory_specs"
 
 
-def _load_clinical_data_list(cdt_path: Path, specs_df: pd.DataFrame) -> List[str]:
+def _filter_invalid_data(
+    data: str, index: int, specs_values: set, specs_df: pd.DataFrame
+) -> bool:
+    """Tell whether a line from the user txt file is invalid (empty, unknown or not present in 'sessions').
+
+    Parameters
+    ----------
+    data: str
+        Stripped line from the user txt file.
+
+    index: int
+        Index of the line.
+
+    specs_values: set
+        Set of non-null values loaded from the specifications.
+
+    specs_df: pd.DataFrame
+        Dataframe loaded from the specifications.
+
+    Returns
+    -------
+    bool
+        Filtering result. True if the data line should be skipped. False otherwise.
+    """
+    # Skip empty lines
+    if not data:
+        return True
+
+    # Skip unknown lines in specs
+    if data not in specs_values:
+        log_and_warn(
+            f"Line {index}: '{data}' not found in specifications. It will be ignored.",
+            UserWarning,
+        )
+        return True
+
+    # Skip lines not in 'sessions'
+    if data not in specs_df["sessions"].values:
+        return True
+
+    return False
+
+
+def _load_clinical_data_list(cdt_path: Path) -> List[str]:
     """Load the list of clinical data fields selected by the user from a txt file.
 
     Parameters
     ----------
     cdt_path: Path
         TXT file containing the data fields the user wishes to have from the excel spreadsheets
 
-    specs_df: pd.DataFrame
-        Dataframe loaded from the specifications
-
     Returns
     -------
     List[str]
         List of selected clinical data fields
     """
     clinical_data_list = []
 
-    specs_values = {
-        str(value).strip()
-        for value in specs_df.to_numpy().ravel()
-        if pd.notna(value) and str(value).strip() != ""
-    }
+    full_specs = pd.read_csv(
+        Path(__file__).parent / "specifications/full_specs.csv",
+        sep=";",
+    )
+
+    specs_values = {value for value in full_specs.to_numpy().ravel() if pd.notna(value)}
 
     with open(cdt_path, "r", encoding="utf-8") as f:
         for i, line in enumerate(f, start=1):
             data = line.strip()
 
-            if not data:
-                continue  # Skip empty lines
-
-            if data not in specs_values:
-                log_and_warn(
-                    f"Line {i}: '{data}' not found in specifications. It will be ignored.",
-                    UserWarning,
-                )
+            if _filter_invalid_data(data, i, specs_values, full_specs):
                 continue
 
             clinical_data_list.append(data)
 
     if not clinical_data_list:
         log_and_warn(
-            "'-clinical_data_txt/cdt' is empty (no valid entries found).", UserWarning
+            f"File for option '-clinical_data_txt/cdt' at location {cdt_path} does not contain any valid entry.",
+            UserWarning,
         )
 
     return clinical_data_list
 
 
 def _merge_clinical_data_list_into_df(
-    clinical_data_list: List[str], specs_df: pd.DataFrame, df_to_complete: pd.DataFrame
+    clinical_data_list: List[str], df_to_complete: pd.DataFrame
 ) -> pd.DataFrame:
     """Merge clinical data list into the 'sessions' column of a specs like dataframe to complete.
 
@@ -344,9 +379,6 @@ def _merge_clinical_data_list_into_df(
     clinical_data_list: List[str]
         List of selected clinical data fields
 
-    specs_df: Path
-        Dataframe loaded from the specifications
-
     df_to_complete: pd.DataFrame
         Specs like dataframe to complete
 
@@ -361,16 +393,15 @@ def _merge_clinical_data_list_into_df(
         if value in sessions_values:
             continue
 
-        if value in specs_df["sessions"].values:
-            last_valid_idx = df_to_complete["sessions"].last_valid_index()
+        last_valid_idx = df_to_complete["sessions"].last_valid_index()
 
-            next_idx = last_valid_idx + 1
+        next_idx = last_valid_idx + 1
 
-            if next_idx < len(df_to_complete):
-                df_to_complete.loc[next_idx, "sessions"] = value
+        if next_idx < len(df_to_complete):
+            df_to_complete.loc[next_idx, "sessions"] = value
 
-            else:
-                df_to_complete.loc[len(df_to_complete), "sessions"] = value
+        else:
+            df_to_complete.loc[len(df_to_complete), "sessions"] = value
 
     return df_to_complete
 
@@ -426,14 +457,8 @@ def prepare_dataset_to_bids_format(
             )
 
         else:
-            full_specs = pd.read_csv(
-                Path(__file__).parent / "specifications/full_specs.csv",
-                sep=";",
-            )
-
             specifications = _merge_clinical_data_list_into_df(
-                _load_clinical_data_list(path_to_clinical_txt, full_specs),
-                full_specs,
+                _load_clinical_data_list(path_to_clinical_txt),
                 specifications.copy(),
             )
 
diff --git a/test/unittests/converters/test_genfi_to_bids_utils.py b/test/unittests/converters/test_genfi_to_bids_utils.py
@@ -560,16 +560,90 @@ def test_specs_depending_on_option(full, gif, expected):
 )
 
 
+SPECS_VALUES = {value for value in FULL_SPECS_DF.to_numpy().ravel() if pd.notna(value)}
+
+
+def test_filter_empty_data():
+    from clinica.converters.genfi_to_bids._utils import _filter_invalid_data
+
+    assert _filter_invalid_data("", 0, SPECS_VALUES, FULL_SPECS_DF)
+
+
+def test_filter_unknown_data():
+    from clinica.converters.genfi_to_bids._utils import _filter_invalid_data
+
+    data = "invalid_data_0"
+
+    expected = f"Line {0}: '{data}' not found in specifications. It will be ignored."
+
+    with pytest.warns(UserWarning, match=re.escape(expected)):
+        assert _filter_invalid_data(data, 0, SPECS_VALUES, FULL_SPECS_DF)
+
+
+@pytest.mark.parametrize(
+    ("data", "index"),
+    [
+        (
+            "source",
+            0,
+        ),
+        (
+            "blinded_code",
+            1,
+        ),
+        (
+            "bids_filename",
+            2,
+        ),
+        (
+            "bids_full_path",
+            3,
+        ),
+    ],
+)
+def test_filter_no_in_sessions_data(data, index):
+    from clinica.converters.genfi_to_bids._utils import _filter_invalid_data
+
+    assert _filter_invalid_data(data, index, SPECS_VALUES, FULL_SPECS_DF)
+
+
+@pytest.mark.parametrize(
+    ("data", "index"),
+    [
+        (
+            "genfi_version",
+            0,
+        ),
+        (
+            "aad",
+            1,
+        ),
+        (
+            "aad_1",
+            2,
+        ),
+        (
+            "aad_2",
+            3,
+        ),
+    ],
+)
+def test_filter_valid_data(data, index):
+    from clinica.converters.genfi_to_bids._utils import _filter_invalid_data
+
+    assert not _filter_invalid_data(data, index, SPECS_VALUES, FULL_SPECS_DF)
+
+
 @pytest.mark.parametrize(
     ("clinical_data_list", "expected"),
     [
         (
             ["blinded_family\n", "aad_1\n", "bids_filename"],
-            ["blinded_family", "aad_1", "bids_filename"],
+            ["aad_1"],
         ),
         (
             ["  blinded_site  \n", "\n", "aad\n", "\tbids_full_path\n"],
-            ["blinded_site", "aad", "bids_full_path"],
+            ["aad"],
         ),  # whitespaces + empty lines
     ],
 )
@@ -579,29 +653,31 @@ def test_load_clinical_data_list_success(tmp_path, clinical_data_list, expected)
     cdt_path = tmp_path / "additional_clinical_data.txt"
     cdt_path.write_text("".join(clinical_data_list), encoding="utf-8")
 
-    out = _load_clinical_data_list(cdt_path, FULL_SPECS_DF)
+    out = _load_clinical_data_list(cdt_path)
 
     assert out == expected
 
 
 @pytest.mark.parametrize(
-    ("clinical_data_list", "expected"),
+    "clinical_data_list",
     [
-        ([], "'-clinical_data_txt/cdt' is empty (no valid entries found)."),
-        (
-            ["\n", "   \n", "\t\n"],
-            "'-clinical_data_txt/cdt' is empty (no valid entries found).",
-        ),
+        [],
+        ["\n", "   \n", "\t\n"],
     ],
 )
-def test_load_clinical_data_list_empty(tmp_path, clinical_data_list, expected):
+def test_load_clinical_data_list_empty(tmp_path, clinical_data_list):
     from clinica.converters.genfi_to_bids._utils import _load_clinical_data_list
 
     cdt_path = tmp_path / "additional_clinical_data.txt"
     cdt_path.write_text("".join(clinical_data_list), encoding="utf-8")
 
+    expected = (
+        f"File for option '-clinical_data_txt/cdt' at location {cdt_path} "
+        "does not contain any valid entry."
+    )
+
     with pytest.warns(UserWarning, match=re.escape(expected)):
-        _load_clinical_data_list(cdt_path, FULL_SPECS_DF)
+        _load_clinical_data_list(cdt_path)
 
 
 def test_load_clinical_data_list_unknown_field(tmp_path):
@@ -613,7 +689,7 @@ def test_load_clinical_data_list_unknown_field(tmp_path):
     expected = "Line 2: 'false_field' not found in specifications."
 
     with pytest.warns(UserWarning, match=re.escape(expected)):
-        _load_clinical_data_list(cdt_path, FULL_SPECS_DF)
+        _load_clinical_data_list(cdt_path)
 
 
 def test_merge_clinical_data_list_into_df_in_sessions():
@@ -624,7 +700,7 @@ def test_merge_clinical_data_list_into_df_in_sessions():
     clinical_data_list = ["aad", "aad_1", "aad_2"]
 
     out = _merge_clinical_data_list_into_df(
-        clinical_data_list, FULL_SPECS_DF, TO_COMPLETE_SPECS_DF.copy()
+        clinical_data_list, TO_COMPLETE_SPECS_DF.copy()
     )
 
     expected = pd.DataFrame(
@@ -667,7 +743,7 @@ def test_merge_clinical_data_list_into_df_no_duplicate():
     clinical_data_list = ["participant_id", "session_id"]
 
     out = _merge_clinical_data_list_into_df(
-        clinical_data_list, FULL_SPECS_DF, TO_COMPLETE_SPECS_DF.copy()
+        clinical_data_list, TO_COMPLETE_SPECS_DF.copy()
     )
 
     expected = pd.DataFrame(