[JTH] little modifications in kma reg guided albas implementation

tausiaj · tausiaj · commit 85fa4f6de15c · 2025-05-14T13:31:59.000+02:00
diff --git a/bluemath_tk/core/decorators.py b/bluemath_tk/core/decorators.py
@@ -130,28 +130,42 @@ def wrapper(
         min_number_of_points: int = None,
         max_number_of_iterations: int = 10,
         normalize_data: bool = False,
-        regression_guided: Dict[str, Dict[str, Any]] = {},
+        regression_guided: Dict[str, List] = {},
     ):
         if data is None:
-            raise ValueError("Data cannot be None")
+            raise ValueError("data cannot be None")
         elif not isinstance(data, pd.DataFrame):
-            raise TypeError("Data must be a pandas DataFrame")
+            raise TypeError("data must be a pandas DataFrame")
         if not isinstance(directional_variables, list):
-            raise TypeError("Directional variables must be a list")
+            raise TypeError("directional_variables must be a list")
         if not isinstance(custom_scale_factor, dict):
-            raise TypeError("Custom scale factor must be a dict")
+            raise TypeError("custom_scale_factor must be a dict")
         if min_number_of_points is not None:
             if not isinstance(min_number_of_points, int) or min_number_of_points <= 0:
-                raise ValueError("Minimum number of points must be integer and > 0")
+                raise ValueError("min_number_of_points must be integer and > 0")
         if (
             not isinstance(max_number_of_iterations, int)
             or max_number_of_iterations <= 0
         ):
-            raise ValueError("Maximum number of iterations must be integer and > 0")
+            raise ValueError("max_number_of_iterations must be integer and > 0")
         if not isinstance(normalize_data, bool):
-            raise TypeError("Normalize data must be a boolean")
+            raise TypeError("normalize_data must be a boolean")
         if not isinstance(regression_guided, dict):
             raise TypeError("regression_guided must be a dictionary")
+        if not all(
+            isinstance(var, str) and var in data.columns
+            for var in regression_guided.get("vars", [])
+        ):
+            raise TypeError(
+                "regression_guided vars must be a list of strings and must exist in data"
+            )
+        if not all(
+            isinstance(alpha, float) and alpha >= 0 and alpha <= 1
+            for alpha in regression_guided.get("alpha", [])
+        ):
+            raise TypeError(
+                "regression_guided alpha must be a list of floats between 0 and 1"
+            )
         return func(
             self,
             data,
@@ -160,7 +174,7 @@ def wrapper(
             min_number_of_points,
             max_number_of_iterations,
             normalize_data,
-            regression_guided
+            regression_guided,
         )
 
     return wrapper
@@ -388,7 +402,6 @@ def wrapper(
         self,
         data: xr.Dataset,
         fit_params: Dict[str, Dict[str, Any]] = {},
-        regression_guided: Dict[str, Dict[str, Any]] = {},
         variable_to_sort_bmus: str = None,
     ):
         if not isinstance(data, xr.Dataset):
diff --git a/bluemath_tk/datamining/kma.py b/bluemath_tk/datamining/kma.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Any, Dict
+from typing import Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
@@ -178,17 +178,32 @@ def data_to_fit(self) -> pd.DataFrame:
 
         return self._data_to_fit
 
+    @staticmethod
+    def add_regression_guided(
+        data: pd.DataFrame, vars: List[str], alpha: List[float]
+    ) -> pd.DataFrame:
+        """
+        Calculate regression-guided variables.
 
-    def add_regression_guided(self, data: pd.DataFrame, vars: List[str], alpha: List[float]) -> pd.DataFrame:
+        Parameters
+        ----------
+        data : pd.DataFrame
+            The data to fit the K-Means algorithm.
+        vars : List[str]
+            The variables to use for regression-guided clustering.
+        alpha : List[float]
+            The alpha values to use for regression-guided clustering.
 
-        """
-        Help KMA clustering features with regression-guided variables.
+        Returns
+        -------
+        pd.DataFrame
+            The data with the regression-guided variables.
         """
 
         # Stack guiding variables into (time, n_vars) array
         X = data.drop(columns=vars)
         Y = np.stack([data[var].values for var in vars], axis=1)
-        
+
         # Normalize input features
         X_std = X.std().replace(0, 1)
         X_norm = X / X_std
@@ -223,7 +238,7 @@ def fit(
         min_number_of_points: int = None,
         max_number_of_iterations: int = 10,
         normalize_data: bool = False,
-        regression_guided: Dict[str, Dict[str, Any]] = {},
+        regression_guided: Dict[str, List] = {},
     ) -> None:
         """
         Fit the K-Means algorithm to the provided data.
@@ -232,8 +247,7 @@ def fit(
         provided dataframe and custom scale factor.
         It normalizes the data, and returns the calculated centroids.
 
-        TODO: Implement KMA regression guided with variable.
-              Add option to force KMA initialization with MDA centroids.
+        TODO: Add option to force KMA initialization with MDA centroids.
 
         Parameters
         ----------
@@ -256,22 +270,23 @@ def fit(
             A flag to normalize the data. Default is False.
         regression_guided: dict, optional
             A dictionary specifying regression-guided clustering variables and relative weights.
+            Example: {"vars":["Fe"],"alpha":[0.6]}. Default is {}.
         """
-        
+
         if regression_guided:
             data = self.add_regression_guided(
-                data=data, 
-                vars = regression_guided.get("vars", None),
-                alpha = regression_guided.get("alpha", None)
+                data=data,
+                vars=regression_guided.get("vars", None),
+                alpha=regression_guided.get("alpha", None),
             )
-        
+
         super().fit(
             data=data,
             directional_variables=directional_variables,
             custom_scale_factor=custom_scale_factor,
             normalize_data=normalize_data,
         )
-        
+
         # Fit K-Means algorithm
         if min_number_of_points is not None:
             stable_kma_child = False
@@ -303,7 +318,7 @@ def fit(
         self.centroids = self.denormalize(
             normalized_data=self.normalized_centroids, scale_factor=self.scale_factor
         )
-        
+
         for directional_variable in self.directional_variables:
             self.centroids[directional_variable] = self.get_degrees_from_uv(
                 xu=self.centroids[f"{directional_variable}_u"].values,
@@ -348,7 +363,7 @@ def fit_predict(
         min_number_of_points: int = None,
         max_number_of_iterations: int = 10,
         normalize_data: bool = False,
-        regression_guided: Dict[str, Dict[str, Any]] = {},
+        regression_guided: Dict[str, List] = {},
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Fit the K-Means algorithm to the provided data and predict the nearest centroid
@@ -373,22 +388,25 @@ def fit_predict(
             Default is 10.
         normalize_data : bool, optional
             A flag to normalize the data. Default is False.
+        regression_guided: dict, optional
+            A dictionary specifying regression-guided clustering variables and relative weights.
+            Example: {"vars":["Fe"],"alpha":[0.6]}. Default is {}.
 
         Returns
         -------
         Tuple[pd.DataFrame, pd.DataFrame]
             A tuple containing the nearest centroid index for each data point,
             and the nearest centroids.
         """
-        
+
         self.fit(
             data=data,
             directional_variables=directional_variables,
             custom_scale_factor=custom_scale_factor,
             min_number_of_points=min_number_of_points,
             max_number_of_iterations=max_number_of_iterations,
             normalize_data=normalize_data,
-            regression_guided=regression_guided
+            regression_guided=regression_guided,
         )
 
         return self.predict(data=data)
diff --git a/bluemath_tk/predictor/xwt.py b/bluemath_tk/predictor/xwt.py
@@ -1,5 +1,3 @@
-from typing import List
-
 import logging
 import warnings
 from datetime import datetime, timedelta
@@ -303,7 +301,7 @@ def get_conditioned_probabilities(self) -> pd.DataFrame:
         )
 
         return df_cond_probs
-    
+
     @validate_data_xwt
     def fit(
         self,
@@ -327,6 +325,9 @@ def fit(
         ------
         XWTError
             If the data is not PCA formatted.
+
+        TODO: Standardize PCs by first PC variance.
+              pca.pcs_df / pca.pcs.stds.isel(n_component=0).values ??
         """
 
         # Make a copy of the data to avoid modifying the original dataset
@@ -346,17 +347,16 @@ def fit(
 
         kma: KMA = self.steps.get("kma")
         self.num_clusters = kma.num_clusters
-        # TODO: standarize PCs by first PC variance
-        
-        data_to_kma = pca.pcs_df
-        
+
+        data_to_kma = pca.pcs_df.copy()
+
         if "regression_guided" in fit_params.get("kma", {}):
             guiding_vars = fit_params["kma"]["regression_guided"].get("vars", [])
-            
-            if guiding_vars:    
+
+            if guiding_vars:
                 guiding_data = pd.DataFrame(
                     {var: data[var].values for var in guiding_vars},
-                    index=data.time.values
+                    index=data.time.values,
                 )
                 data_to_kma = pd.concat([data_to_kma, guiding_data], axis=1)
 
diff --git a/tests/datamining/test_kma.py b/tests/datamining/test_kma.py
@@ -1,6 +1,8 @@
 import unittest
+
 import numpy as np
 import pandas as pd
+
 from bluemath_tk.datamining.kma import KMA
 
 
@@ -46,6 +48,19 @@ def test_fit_predict(self):
         self.assertIsInstance(predicted_labels_df, pd.DataFrame)
         self.assertEqual(predicted_labels_df.shape[0], 1000)
 
+    def test_add_regression_guided(self):
+        data = self.df.copy()
+        data["Fe"] = data["Hs"] ** 2 * data["Tp"]
+        predicted_labels, predicted_labels_df = self.kma.fit_predict(
+            data=data,
+            directional_variables=["Dir"],
+            regression_guided={"vars": ["Fe"], "alpha": [0.6]},
+        )
+        self.assertIsInstance(predicted_labels, pd.DataFrame)
+        self.assertEqual(len(predicted_labels), 1000)
+        self.assertIsInstance(predicted_labels_df, pd.DataFrame)
+        self.assertEqual(predicted_labels_df.shape[0], 1000)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/interpolation/test_rbf.py b/tests/interpolation/test_rbf.py
@@ -10,9 +10,9 @@ class TestRBF(unittest.TestCase):
     def setUp(self):
         self.dataset = pd.DataFrame(
             {
-                "Hs": np.random.rand(1000) * 7,
-                "Tp": np.random.rand(1000) * 20,
-                "Dir": np.random.rand(1000) * 360,
+                "Hs": np.random.rand(100) * 7,
+                "Tp": np.random.rand(100) * 20,
+                "Dir": np.random.rand(100) * 360,
             }
         )
         self.subset = self.dataset.sample(frac=0.25)

Original file line number	Diff line number	Diff line change
`@@ -10,9 +10,9 @@ class TestRBF(unittest.TestCase):`
`10`	`10`	`def setUp(self):`
`11`	`11`	`self.dataset = pd.DataFrame(`
`12`	`12`	`{`
`13`		`- "Hs": np.random.rand(1000) * 7,`
`14`		`- "Tp": np.random.rand(1000) * 20,`
`15`		`- "Dir": np.random.rand(1000) * 360,`
	`13`	`+ "Hs": np.random.rand(100) * 7,`
	`14`	`+ "Tp": np.random.rand(100) * 20,`
	`15`	`+ "Dir": np.random.rand(100) * 360,`
`16`	`16`	`}`
`17`	`17`	`)`
`18`	`18`	`self.subset = self.dataset.sample(frac=0.25)`