[JTH] generalise kma test

tausiaj · tausiaj · commit 2383b9184c8e · 2025-03-06T13:17:24.000+01:00
diff --git a/bluemath_tk/datamining/kma.py b/bluemath_tk/datamining/kma.py
@@ -284,7 +284,8 @@ def predict(self, data: pd.DataFrame) -> Tuple[np.ndarray, pd.DataFrame]:
         Returns
         -------
         Tuple[np.ndarray, pd.DataFrame]
-            A tuple containing the nearest centroid index for each data point and the nearest centroids.
+            A tuple containing the nearest centroid index for each data point,
+            and the nearest centroids.
         """
 
         if self.is_fitted is False:
@@ -310,9 +311,11 @@ def fit_predict(
         data: pd.DataFrame,
         directional_variables: List[str] = [],
         custom_scale_factor: dict = {},
+        min_number_of_points: int = None,
     ) -> Tuple[np.ndarray, pd.DataFrame]:
         """
-        Fit the K-Means algorithm to the provided data and predict the nearest centroid for each data point.
+        Fit the K-Means algorithm to the provided data and predict the nearest centroid
+        for each data point.
 
         Parameters
         ----------
@@ -324,17 +327,22 @@ def fit_predict(
         custom_scale_factor : dict
             A dictionary specifying custom scale factors for normalization.
             Default is {}.
+        min_number_of_points : int, optional
+            The minimum number of points that each cluster must contain.
+            Default is None.
 
         Returns
         -------
-        Tuple[pd.DataFrame, np.ndarray, pd.DataFrame]
-            A tuple containing the nearest centroid index for each data point, and the nearest centroids.
+        Tuple[np.ndarray, pd.DataFrame]
+            A tuple containing the nearest centroid index for each data point,
+            and the nearest centroids.
         """
 
         self.fit(
             data=data,
             directional_variables=directional_variables,
             custom_scale_factor=custom_scale_factor,
+            min_number_of_points=min_number_of_points,
         )
 
         return self.predict(data=data)
diff --git a/tests/datamining/test_kma.py b/tests/datamining/test_kma.py
@@ -16,7 +16,7 @@ def setUp(self):
         self.kma = KMA(num_clusters=10)
 
     def test_fit(self):
-        self.kma.fit(data=self.df, min_number_of_points=80)
+        self.kma.fit(data=self.df, min_number_of_points=50)
         self.assertIsInstance(self.kma.centroids, pd.DataFrame)
         self.assertEqual(self.kma.centroids.shape[0], 10)
 
@@ -36,11 +36,15 @@ def test_predict(self):
         self.assertEqual(nearest_centroid_df.shape[0], 15)
 
     def test_fit_predict(self):
-        nearest_centroids, nearest_centroid_df = self.kma.fit_predict(data=self.df)
-        self.assertIsInstance(nearest_centroids, np.ndarray)
-        self.assertEqual(len(nearest_centroids), 1000)
-        self.assertIsInstance(nearest_centroid_df, pd.DataFrame)
-        self.assertEqual(nearest_centroid_df.shape[0], 1000)
+        predicted_labels, predicted_labels_df = self.kma.fit_predict(
+            data=self.df, min_number_of_points=50
+        )
+        _unique_labels, counts = np.unique(predicted_labels, return_counts=True)
+        self.assertTrue(np.all(counts >= 50))
+        self.assertIsInstance(predicted_labels, np.ndarray)
+        self.assertEqual(len(predicted_labels), 1000)
+        self.assertIsInstance(predicted_labels_df, pd.DataFrame)
+        self.assertEqual(predicted_labels_df.shape[0], 1000)
 
 
 if __name__ == "__main__":