[JTH] add pca and rbf optimized with threads and working

tausiaj · tausiaj · commit ad7e097c05dc · 2025-02-06T10:27:07.000+01:00
diff --git a/bluemath_tk/core/decorators.py b/bluemath_tk/core/decorators.py
@@ -226,18 +226,11 @@ def wrapper(
                 "PCA dimension for rows must be a string and found in the data dimensions"
             )
         for variable, windows in windows_in_pca_dim_for_rows.items():
-            if variable not in vars_to_stack:
-                raise ValueError(f"Variable {variable} not found in vars_to_stack")
             if not isinstance(windows, list):
                 raise TypeError("Windows must be a list")
             if not all([isinstance(window, int) and window > 0 for window in windows]):
                 raise ValueError("Windows must be a list of integers > 0")
-        for variable, _ in value_to_replace_nans.items():
-            if variable not in vars_to_stack:
-                raise ValueError(f"Variable {variable} not found in vars_to_stack")
         for variable, threshold in nan_threshold_to_drop.items():
-            if variable not in vars_to_stack:
-                raise ValueError(f"Variable {variable} not found in vars_to_stack")
             if not isinstance(threshold, float) or threshold < 0 or threshold > 1:
                 raise ValueError("Threshold must be a float between 0 and 1")
         return func(
diff --git a/bluemath_tk/datamining/pca.py b/bluemath_tk/datamining/pca.py
@@ -34,18 +34,20 @@ class PCA(BaseReduction):
         The PCA or Incremental PCA model.
     is_fitted : bool
         Indicates whether the PCA model has been fitted.
-    _data : xr.Dataset
+    data : xr.Dataset
         The original dataset.
-    _postprocessed_data : xr.Dataset
-        The postprocessed dataset.
-    _stacked_data_matrix : np.ndarray
+    window_processed_data : xr.Dataset
+        The windows processed dataset.
+    stacked_data_matrix : np.ndarray
         The stacked data matrix.
-    _standarized_stacked_data_matrix : np.ndarray
+    standarized_stacked_data_matrix : np.ndarray
         The standardized stacked data matrix.
     scaler : StandardScaler
         The scaler used for standardizing the data.
     vars_to_stack : List[str]
         The list of variables to stack.
+    window_stacked_vars : List[str]
+        The list of variables with windows.
     coords_to_stack : List[str]
         The list of coordinates to stack.
     pca_dim_for_rows : str
@@ -93,12 +95,19 @@ class PCA(BaseReduction):
     >>> from bluemath_tk.core.data.sample_data import get_2d_dataset
     >>> from bluemath_tk.datamining.pca import PCA
     >>> ds = get_2d_dataset()
-    >>> pca = PCA(n_components=5)
+    >>> pca = PCA(
+    ...     n_components=5,
+    ...     is_incremental=False,
+    ...     debug=True,
+    ... )
     >>> pca.fit(
     ...     data=ds,
     ...     vars_to_stack=["X", "Y"],
     ...     coords_to_stack=["coord1", "coord2"],
     ...     pca_dim_for_rows="coord3",
+    ...     windows_in_pca_dim_for_rows={"X": [1, 2, 3]},
+    ...     value_to_replace_nans={"X": 0.0},
+    ...     nan_threshold_to_drop={"X": 0.95},
     ... )
     >>> pcs = pca.transform(
     ...     data=ds,
@@ -108,6 +117,8 @@ class PCA(BaseReduction):
     >>> explained_variance = pca.explained_variance
     >>> explained_variance_ratio = pca.explained_variance_ratio
     >>> cumulative_explained_variance_ratio = pca.cumulative_explained_variance_ratio
+    >>> # Save the full class in a pickle file
+    >>> pca.save_model("pca_model.pkl")
 
     References
     ----------
@@ -260,18 +271,31 @@ def _generate_stacked_data(self, data: xr.Dataset) -> np.ndarray:
         cleaned_vars_to_stack = []
         for var_to_clean in self.window_stacked_vars:
             var_to_clean_values = tmp_stacked_data[var_to_clean].values
+            # Drop variables with more than 90% of NaNs if not specified
+            var_to_clean_threshold = self.nan_threshold_to_drop.get(
+                var_to_clean,
+                self.nan_threshold_to_drop.get(
+                    var_to_clean[:-2],
+                    self.nan_threshold_to_drop.get(var_to_clean[:-3], 0.90),
+                ),
+            )
             not_nan_positions = np.where(
-                np.mean(np.isnan(var_to_clean_values), axis=0)
-                < self.nan_threshold_to_drop.get(
-                    var_to_clean, 0.05
-                )  # TODO: Add to docstring
+                np.mean(~np.isnan(var_to_clean_values), axis=0) > var_to_clean_threshold
             )[0]
+            # Replace NaNs with the value specified in value_to_replace_nans
+            # If not specified, try to get the value from the variable name, deleting window suffixes
+            var_value_to_replace_nans = self.value_to_replace_nans.get(
+                var_to_clean,
+                self.value_to_replace_nans.get(
+                    var_to_clean[:-2], self.value_to_replace_nans.get(var_to_clean[:-3])
+                ),
+            )
             self.logger.debug(
-                f"Replacing NaNs for variable: {var_to_clean} with value: {self.value_to_replace_nans.get(var_to_clean)}"
+                f"Replacing NaNs for variable: {var_to_clean} with value: {var_value_to_replace_nans}"
             )
             cleaned_var = self.check_nans(
                 data=var_to_clean_values[:, not_nan_positions],
-                replace_value=self.value_to_replace_nans.get(var_to_clean),
+                replace_value=var_value_to_replace_nans,
             )
             cleaned_vars_to_stack.append(cleaned_var)
             self.not_nan_positions[var_to_clean] = not_nan_positions
@@ -496,8 +520,15 @@ def fit(
             The value to replace NaNs for each variable. Default is {}.
         nan_threshold_to_drop : dict, optional
             The threshold percentage to drop NaNs for each variable.
-            By default, variables with more than 95% of NaNs are dropped.
+            By default, variables with more than 90% of NaNs are dropped.
             Default is {}.
+
+        Notes
+        -----
+        For both value_to_replace_nans and nan_threshold_to_drop, the keys are the variables,
+        and the suffixes for the windows are considered.
+        Example: if you have variable "X", and apply windows [1, 2, 3], you can use "X_1", "X_2", "X_3".
+        Nevertheless, you can also use the original variable name "X" to apply the same value for all windows.
         """
 
         self.vars_to_stack = vars_to_stack.copy()
@@ -585,13 +616,20 @@ def fit_transform(
             The value to replace NaNs for each variable. Default is {}.
         nan_threshold_to_drop : dict, optional
             The threshold percentage to drop NaNs for each variable.
-            By default, variables with more than 95% of NaNs are dropped.
+            By default, variables with more than 90% of NaNs are dropped.
             Default is {}.
 
         Returns
         -------
         xr.Dataset
-            The transformed data.
+            The transformed data representing the Principal Components (PCs).
+
+        Notes
+        -----
+        For both value_to_replace_nans and nan_threshold_to_drop, the keys are the variables,
+        and the suffixes for the windows are considered.
+        Example: if you have variable "X", and apply windows [1, 2, 3], you can use "X_1", "X_2", "X_3".
+        Nevertheless, you can also use the original variable name "X" to apply the same value for all windows.
         """
 
         self.fit(
@@ -626,6 +664,8 @@ def inverse_transform(self, PCs: Union[np.ndarray, xr.Dataset]) -> xr.Dataset:
 
         if isinstance(PCs, xr.Dataset):
             X = PCs["PCs"].values
+        elif isinstance(PCs, xr.DataArray):
+            X = PCs.values
         elif isinstance(PCs, np.ndarray):
             X = PCs
 
diff --git a/bluemath_tk/interpolation/rbf.py b/bluemath_tk/interpolation/rbf.py
@@ -669,14 +669,63 @@ def _calc_opt_sigma(
 
         return rbf_coeff, opt_sigma
 
-    def _rbf_interpolate(self, dataset: pd.DataFrame) -> pd.DataFrame:
+    def _rbf_variable_interpolation(
+        self,
+        normalized_dataset: pd.DataFrame,
+        opt_sigma: float,
+        rbf_coeff: np.ndarray,
+        num_points_subset: int,
+        num_vars_subset: int,
+    ) -> np.ndarray:
+        """
+        Interpolates the surface for a single target variable.
+
+        Parameters
+        ----------
+        normalized_dataset : pd.DataFrame
+            The normalized dataset on which the interpolant is evaluated.
+        opt_sigma : float
+            The optimal sigma calculated for the variable (passed to the kernel function).
+        rbf_coeff : np.ndarray
+            The fitted coefficients for the variable. As indexed below, the first
+            ``num_points_subset`` entries weight the kernel values, the entry at index
+            ``num_points_subset`` is the constant term, and the following
+            ``num_vars_subset`` entries are the linear coefficients.
+        num_points_subset : int
+            The number of points used in the fitting.
+        num_vars_subset : int
+            The number of variables used in the fitting.
+
+        Returns
+        -------
+        np.ndarray
+            The interpolated variable.
+        """
+
+        # Pairwise Euclidean distances between every dataset row and every subset row;
+        # the broadcast difference has one row-pair per (dataset, subset) combination.
+        r = np.linalg.norm(
+            normalized_dataset.values[:, np.newaxis, :]
+            - self.normalized_subset_data.values[np.newaxis, :, :],
+            axis=2,
+        )
+        kernel_values = self.kernel_func(r, opt_sigma)
+        # Linear (polynomial) contribution: dataset values times the linear coefficients.
+        linear_part = np.dot(
+            normalized_dataset.values,
+            rbf_coeff[
+                num_points_subset + 1 : num_points_subset + 1 + num_vars_subset
+            ].T,
+        )
+
+        # Interpolant = constant term + kernel-weighted sum + linear part.
+        return (
+            rbf_coeff[num_points_subset]
+            + np.dot(kernel_values, rbf_coeff[:num_points_subset])
+            + linear_part
+        )
+
+    def _rbf_interpolate(
+        self, dataset: pd.DataFrame, num_threads: int = None
+    ) -> pd.DataFrame:
         """
         This function interpolates the dataset.
 
         Parameters
         ----------
         dataset : pd.DataFrame
             The dataset to interpolate (must have same variables as subset).
+        num_threads : int, optional
+            The number of threads to use for the interpolation. Default is None.
 
         Returns
         -------
@@ -698,28 +747,42 @@ def _rbf_interpolate(self, dataset: pd.DataFrame) -> pd.DataFrame:
         )
 
         # Loop through the target variables
-        for i_var, target_var in enumerate(self.target_processed_variables):
-            self.logger.info(f"Interpolating target variable {target_var}")
-            rbf_coeff = self._rbf_coeffs[target_var].values
-            opt_sigma = self._opt_sigmas[target_var]
-            r = np.linalg.norm(
-                normalized_dataset.values[:, np.newaxis, :]
-                - self.normalized_subset_data.values[np.newaxis, :, :],
-                axis=2,
-            )
-            kernel_values = self.kernel_func(r, opt_sigma)
-            linear_part = np.dot(
-                normalized_dataset.values,
-                rbf_coeff[
-                    num_points_subset + 1 : num_points_subset + 1 + num_vars_subset
-                ].T,
-            )
-            s = (
-                rbf_coeff[num_points_subset]
-                + np.dot(kernel_values, rbf_coeff[:num_points_subset])
-                + linear_part
-            )
-            interpolated_array[:, i_var] = s
+        if num_threads is not None:
+            # self.set_num_processors_to_use(num_processors=num_threads)
+            num_threads = min(num_threads, self.get_num_processors_available())
+            self.logger.info(f"Using {num_threads} threads for interpolation.")
+            with ThreadPoolExecutor(max_workers=num_threads) as executor:
+                rbf_variable_calculation = {
+                    executor.submit(
+                        self._rbf_variable_interpolation,
+                        normalized_dataset,
+                        self._opt_sigmas[target_var],
+                        self._rbf_coeffs[target_var].values,
+                        num_points_subset,
+                        num_vars_subset,
+                    ): (i_var, target_var)
+                    for i_var, target_var in enumerate(self.target_processed_variables)
+                }
+                for future in as_completed(rbf_variable_calculation):
+                    i_rbf_var, rbf_variable = rbf_variable_calculation[future]
+                    try:
+                        interpolated_var = future.result()
+                        interpolated_array[:, i_rbf_var] = interpolated_var  # store the job result, not the array itself
+                    except Exception as exc:
+                        self.logger.error(
+                            f"Job for {rbf_variable} generated an exception: {exc}."
+                        )
+        else:
+            for i_var, target_var in enumerate(self.target_processed_variables):
+                self.logger.info(f"Interpolating target variable {target_var}")
+                interpolated_var = self._rbf_variable_interpolation(
+                    normalized_dataset=normalized_dataset,
+                    opt_sigma=self._opt_sigmas[target_var],
+                    rbf_coeff=self._rbf_coeffs[target_var].values,
+                    num_points_subset=num_points_subset,
+                    num_vars_subset=num_vars_subset,
+                )
+                interpolated_array[:, i_var] = interpolated_var
 
         return pd.DataFrame(interpolated_array, columns=self.target_processed_variables)
 
@@ -826,14 +889,16 @@ def fit(
         # Set the is_fitted attribute to True
         self.is_fitted = True
 
-    def predict(self, dataset: pd.DataFrame) -> pd.DataFrame:
+    def predict(self, dataset: pd.DataFrame, num_threads: int = None) -> pd.DataFrame:
         """
         Predicts the data for the provided dataset.
 
         Parameters
         ----------
         dataset : pd.DataFrame
             The dataset to predict (must have same variables than subset).
+        num_threads : int, optional
+            The number of threads to use for the interpolation. Default is None.
 
         Returns
         -------
@@ -857,7 +922,9 @@ def predict(self, dataset: pd.DataFrame) -> pd.DataFrame:
             raise RBFError("RBF model must be fitted before predicting.")
 
         self.logger.info("Reconstructing data using fitted coefficients.")
-        interpolated_target = self._rbf_interpolate(dataset=dataset)
+        interpolated_target = self._rbf_interpolate(
+            dataset=dataset, num_threads=num_threads
+        )
         if self.is_target_normalized:
             self.logger.info("Denormalizing target data")
             interpolated_target = self.denormalize(
@@ -870,6 +937,7 @@ def predict(self, dataset: pd.DataFrame) -> pd.DataFrame:
                 xu=interpolated_target[f"{directional_variable}_u"].values,
                 xv=interpolated_target[f"{directional_variable}_v"].values,
             )
+
         return interpolated_target
 
     def fit_predict(
@@ -933,4 +1001,4 @@ def fit_predict(
             iteratively_update_sigma=iteratively_update_sigma,
         )
 
-        return self.predict(dataset=dataset)
+        return self.predict(dataset=dataset, num_threads=num_threads)
diff --git a/tests/datamining/test_pca.py b/tests/datamining/test_pca.py
@@ -6,38 +6,23 @@
 class TestPCA(unittest.TestCase):
     def setUp(self):
         self.ds = get_2d_dataset()
-        self.pca = PCA(n_components=5)
+        self.pca = PCA(n_components=5, debug=True)
         self.ipca = PCA(n_components=5, is_incremental=True)
 
-    def test_fit(self):
-        self.pca.fit(
-            data=self.ds,
-            vars_to_stack=["X", "Y"],
-            coords_to_stack=["coord1", "coord2"],
-            pca_dim_for_rows="coord3",
-        )
-        self.assertEqual(self.pca.is_fitted, True)
-
-    def test_transform(self):
-        self.pca.fit(
-            data=self.ds,
-            vars_to_stack=["X", "Y"],
-            coords_to_stack=["coord1", "coord2"],
-            pca_dim_for_rows="coord3",
-        )
-        pcs = self.pca.transform(
-            data=self.ds,
-        )
-        self.assertEqual(pcs.PCs.shape[1], 5)
-
     def test_fit_transform(self):
         pcs = self.pca.fit_transform(
             data=self.ds,
             vars_to_stack=["X", "Y"],
             coords_to_stack=["coord1", "coord2"],
             pca_dim_for_rows="coord3",
+            windows_in_pca_dim_for_rows={"X": [3], "Y": [1]},
+            value_to_replace_nans={"X": 0.0, "X_3": 1.0, "Y": 0.0},
+            nan_threshold_to_drop={"X": 0.5, "Y": 0.5},
         )
+        self.assertEqual(self.pca.is_fitted, True)
         self.assertEqual(pcs.PCs.shape[1], 5)
+        self.assertEqual(pcs.PCs.shape[0], self.ds.sizes["coord3"])
+        self.assertCountEqual(self.pca.eofs.data_vars, ["X", "X_3", "Y", "Y_1"])
 
     def test_inverse_transform(self):
         pcs = self.pca.fit_transform(
diff --git a/tests/interpolation/test_rbf.py b/tests/interpolation/test_rbf.py
@@ -29,6 +29,7 @@ def test_fit(self):
             target_data=self.target,
             target_directional_variables=["DirPred"],
             normalize_target_data=True,
+            num_threads=4,
         )
         self.assertTrue(self.rbf.is_fitted)
         self.assertTrue(self.rbf.is_target_normalized)

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ def test_fit(self):`
`29`	`29`	`target_data=self.target,`
`30`	`30`	`target_directional_variables=["DirPred"],`
`31`	`31`	`normalize_target_data=True,`
	`32`	`+ num_threads=4,`
`32`	`33`	`)`
`33`	`34`	`self.assertTrue(self.rbf.is_fitted)`
`34`	`35`	`self.assertTrue(self.rbf.is_target_normalized)`