Support array-type metadata fields in cubids group (#407)

tsalo · web-flow · commit f7e5af4550ab · 2025-04-04T15:48:36.000-04:00
* Support array-type metadata fields. * Try supporting lists of strings too. * Add test. * Update test_utils.py * Update stuff. * Update test_utils.py * Rename format_params to cluster_single_parameters. * Keep working. * Move cluster_single_parameters from cubids to utils. * Fix import. * Remove unused function. * Update utils.py * Update test_utils.py * Update round_params too. * Update test. * Update. * Update. * Fix possible bug from #439. If you have two unique values (NaNs and some actual value), it would label everything as cluster 0, but it should probably label the actual values as 0 and the NaNs as 1. * Allow ndarray metadata. * Add ImageOrientationPatientDICOM, remove Obliquity * Remove obliquity mentions. * Revert obliquity-related changes. * Update test_bond.py * Update example.rst
diff --git a/cubids/tests/test_cubids.py b/cubids/tests/test_cubids.py
@@ -449,21 +449,6 @@ def _test__get_param_groups(cubids_instance):
     # Add assertions here
 
 
-def _test_round_params(cubids_instance):
-    """Test rounding parameters.
-
-    Parameters
-    ----------
-    cubids_instance : CuBIDS
-        An instance of the CuBIDS class.
-    """
-    param_group_df = pd.DataFrame({"param": [0.123456789]})
-    config = {"param": {"round": 3}}
-    modality = "bold"
-    rounded_params = cubids_instance.round_params(param_group_df, config, modality)
-    # Add assertions here
-
-
 def _test_get_sidecar_metadata(cubids_instance):
     """Test getting sidecar metadata.
 
@@ -477,21 +462,6 @@ def _test_get_sidecar_metadata(cubids_instance):
     # Add assertions here
 
 
-def _test_format_params(cubids_instance):
-    """Test formatting parameters.
-
-    Parameters
-    ----------
-    cubids_instance : CuBIDS
-        An instance of the CuBIDS class.
-    """
-    param_group_df = pd.DataFrame({"param": [0.123456789]})
-    config = {"param": {"format": "{:.2f}"}}
-    modality = "bold"
-    formatted_params = cubids_instance.format_params(param_group_df, config, modality)
-    # Add assertions here
-
-
 def _test__order_columns(cubids_instance):
     """Test ordering columns.
 
diff --git a/cubids/tests/test_utils.py b/cubids/tests/test_utils.py
@@ -0,0 +1,184 @@
+"""Tests for the utils module."""
+
+import pandas as pd
+
+from cubids import utils
+from cubids.tests.utils import compare_group_assignments
+
+
+def test_round_params():
+    """Check that round_params rounds only the columns configured with a precision."""
+    # Only "A" (scalars) and "B" (lists) carry a precision for the "func" modality
+    rounding_config = {
+        "sidecar_params": {
+            "func": {
+                "A": {"precision": 2},
+                "B": {"precision": 2},
+            },
+        },
+        "derived_params": {
+            "func": {},
+        },
+    }
+
+    # Input frame: "C" holds text and "D" is numeric but has no configured precision
+    raw = pd.DataFrame(
+        {
+            "A": [1.12345, 2.23456, 3.34567],
+            "B": [[1.12345, 2.23456], [3.34567, 4.45678], [5.56789, 6.67890]],
+            "C": ["text", "more text", "even more text"],
+            "D": [1.12345, 2.23456, 3.34567],
+        }
+    )
+
+    # Round using the "func" modality settings
+    result = utils.round_params(raw, rounding_config, "func")
+
+    # "A" and "B" are expected at two decimals; "C" and "D" are expected unchanged
+    expected = pd.DataFrame(
+        {
+            "A": [1.12, 2.23, 3.35],
+            "B": [[1.12, 2.23], [3.35, 4.46], [5.57, 6.68]],
+            "C": ["text", "more text", "even more text"],
+            "D": [1.12345, 2.23456, 3.34567],
+        }
+    )
+
+    # Compare the rounded frame against the expectation
+    pd.testing.assert_frame_equal(result, expected)
+
+
+def test_cluster_single_parameters():
+    """Test the cubids.utils.cluster_single_parameters function.
+
+    Checks the cluster labels assigned to a scalar field, a numeric-list field, and a
+    string-list field, first with a strict SliceTiming tolerance and then a looser one.
+    """
+    config = {
+        "sidecar_params": {
+            "func": {
+                "RepetitionTime": {"tolerance": 0.01, "suggest_variant_rename": True},
+                "TaskName": {"suggest_variant_rename": True},
+                "SliceTiming": {"tolerance": 0.01, "suggest_variant_rename": True},
+                "ImageType": {"suggest_variant_rename": True},
+            },
+        },
+        "derived_params": {
+            "func": {},
+        },
+    }
+
+    # Mock up the input: rows 0-1 are identical; each later row changes exactly one field.
+    params = [
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            # TaskName variant
+            "TaskName": "rest eyes open",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            # RepetitionTime variant
+            "RepetitionTime": 1.9,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            # SliceTiming variant (length)
+            "SliceTiming": [0.0, 0.5, 1.0, 1.5, 2.0],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            # SliceTiming variant (values)
+            "SliceTiming": [0.0, 1.0, 1.9],
+            "ImageType": ["ORIGINAL", "NONE", "M"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            # ImageType variant (length)
+            "ImageType": ["ORIGINAL", "NONE", "M", "NORM"],
+        },
+        {
+            "RepetitionTime": 2.0,
+            "TaskName": "rest eyes closed",
+            "SliceTiming": [0.0, 1.0, 2.0],
+            # ImageType variant (values)
+            "ImageType": ["ORIGINAL", "NONE", "P"],
+        },
+    ]
+    files_df = pd.DataFrame(params)
+    modality = "func"
+
+    # First pass: cluster with the strict SliceTiming tolerance (0.01) from the config
+    out_df = utils.cluster_single_parameters(
+        df=files_df,
+        config=config,
+        modality=modality,
+    )
+    assert isinstance(out_df, pd.DataFrame)
+    assert "Cluster_RepetitionTime" in out_df.columns
+    assert "Cluster_SliceTiming" in out_df.columns
+    assert "Cluster_ImageType" in out_df.columns
+    # Non-list columns without tolerance don't get clustered
+    assert "Cluster_TaskName" not in out_df.columns
+
+    assert compare_group_assignments(
+        out_df["Cluster_RepetitionTime"].values.astype(int),
+        [0, 0, 0, 1, 0, 0, 0, 0],
+    )
+    assert compare_group_assignments(
+        out_df["Cluster_SliceTiming"].values.astype(int),
+        [0, 0, 0, 0, 2, 1, 0, 0],
+    )
+    assert compare_group_assignments(
+        out_df["Cluster_ImageType"].values.astype(int),
+        [0, 0, 0, 0, 0, 0, 1, 2],
+    )
+
+    # Second pass: loosen the SliceTiming tolerance to 0.5 and cluster again
+    config["sidecar_params"]["func"]["SliceTiming"]["tolerance"] = 0.5
+    out_df = utils.cluster_single_parameters(
+        df=files_df,
+        config=config,
+        modality=modality,
+    )
+    assert isinstance(out_df, pd.DataFrame)
+    assert "Cluster_RepetitionTime" in out_df.columns
+    assert "Cluster_SliceTiming" in out_df.columns
+    assert "Cluster_ImageType" in out_df.columns
+    # Non-list columns without tolerance don't get clustered
+    assert "Cluster_TaskName" not in out_df.columns
+
+    assert compare_group_assignments(
+        out_df["Cluster_RepetitionTime"].values.astype(int),
+        [0, 0, 0, 1, 0, 0, 0, 0],
+    )
+    # Different lengths still produce different clusters,
+    # but the value-based variants are now the same
+    assert compare_group_assignments(
+        out_df["Cluster_SliceTiming"].values.astype(int),
+        [0, 0, 0, 0, 1, 0, 0, 0],
+    )
+    assert compare_group_assignments(
+        out_df["Cluster_ImageType"].values.astype(int),
+        [0, 0, 0, 0, 0, 0, 1, 2],
+    )
diff --git a/cubids/tests/utils.py b/cubids/tests/utils.py
@@ -167,3 +167,56 @@ def chdir(path):
         yield
     finally:
         os.chdir(oldpwd)
+
+
+def compare_group_assignments(list1, list2):
+    """Check whether two label sequences describe the same grouping.
+
+    The labels themselves are irrelevant; only the partition they induce matters.
+    The sequences match when the labels of ``list1`` can be paired one-to-one with
+    the labels of ``list2`` such that the pairing holds at every position.
+    Sequences of different lengths never match.
+
+    Parameters
+    ----------
+    list1 : list
+        The first sequence of group labels.
+    list2 : list
+        The second sequence of group labels.
+
+    Returns
+    -------
+    bool
+        True if both sequences induce the same grouping, False otherwise.
+
+    Examples
+    --------
+    >>> list1 = [1, 2, 1, 3, 2]
+    >>> list2 = ['a', 'b', 'a', 'c', 'b']
+    >>> compare_group_assignments(list1, list2)
+    True
+
+    >>> list1 = [1, 2, 1, 3, 2]
+    >>> list2 = ['b', 'd', 'b', 'q', 'd']
+    >>> compare_group_assignments(list1, list2)
+    True
+
+    >>> list1 = [1, 2, 1, 3, 2]
+    >>> list2 = ['a', 'b', 'a', 'c', 'd']
+    >>> compare_group_assignments(list1, list2)
+    False
+    """
+    if len(list1) != len(list2):
+        return False
+
+    assigned = {}
+    for left, right in zip(list1, list2):
+        if left not in assigned:
+            # A new left-hand label must pair with a right-hand label not yet used
+            if right in assigned.values():
+                return False
+            assigned[left] = right
+        elif assigned[left] != right:
+            return False
+
+    return True
diff --git a/cubids/utils.py b/cubids/utils.py