chore: initial refactoring of incremental spmd algos (#2248)

ethanglaser · icfaust · web-flow · commit 2159d03b2cab · 2025-02-17T14:49:54.000-08:00
* Refactor incremental spmd algos

* Clear spmd impls, specify non-spmd get_policy in base cls

* black

* minor basic statistics fix

* apply changes to PCA predict and add transform

* add comments

* tuple indices safeguarding

* incremental basic statistics fit fixes

* restore previous 2, added to raw inputs instead

* Update onedal/decomposition/pca.py

Co-authored-by: Ian Faust <icfaust@gmail.com>

* blacked

---------

Co-authored-by: Ian Faust <icfaust@gmail.com>
diff --git a/onedal/basic_statistics/incremental_basic_statistics.py b/onedal/basic_statistics/incremental_basic_statistics.py
@@ -71,8 +71,9 @@ def __init__(self, result_options="all"):
 
     def _reset(self):
         self._need_to_finalize = False
-        self._partial_result = self._get_backend(
-            "basic_statistics", None, "partial_compute_result"
+        # Not supported with spmd policy so IncrementalBasicStatistics must be specified
+        self._partial_result = IncrementalBasicStatistics._get_backend(
+            IncrementalBasicStatistics, "basic_statistics", None, "partial_compute_result"
         )
 
     def __getstate__(self):
@@ -105,7 +106,10 @@ def partial_fit(self, X, weights=None, queue=None):
             Returns the instance itself.
         """
         self._queue = queue
-        policy = self._get_policy(queue, X)
+        # Not supported with spmd policy so IncrementalBasicStatistics must be specified
+        policy = IncrementalBasicStatistics._get_policy(
+            IncrementalBasicStatistics, queue, X
+        )
 
         X = _check_array(
             X, dtype=[np.float64, np.float32], ensure_2d=False, force_all_finite=False
@@ -123,7 +127,9 @@ def partial_fit(self, X, weights=None, queue=None):
             self._onedal_params = self._get_onedal_params(False, dtype=dtype)
 
         X_table, weights_table = to_table(X, weights, queue=queue)
-        self._partial_result = self._get_backend(
+        # Not supported with spmd policy so IncrementalBasicStatistics must be specified
+        self._partial_result = IncrementalBasicStatistics._get_backend(
+            IncrementalBasicStatistics,
             "basic_statistics",
             None,
             "partial_compute",
diff --git a/onedal/covariance/incremental_covariance.py b/onedal/covariance/incremental_covariance.py
@@ -58,8 +58,9 @@ def __init__(self, method="dense", bias=False, assume_centered=False):
 
     def _reset(self):
         self._need_to_finalize = False
-        self._partial_result = self._get_backend(
-            "covariance", None, "partial_compute_result"
+        # Not supported with spmd policy so IncrementalEmpiricalCovariance must be specified
+        self._partial_result = IncrementalEmpiricalCovariance._get_backend(
+            IncrementalEmpiricalCovariance, "covariance", None, "partial_compute_result"
         )
 
     def __getstate__(self):
@@ -99,15 +100,20 @@ def partial_fit(self, X, y=None, queue=None):
 
         self._queue = queue
 
-        policy = self._get_policy(queue, X)
+        # Not supported with spmd policy so IncrementalEmpiricalCovariance must be specified
+        policy = IncrementalEmpiricalCovariance._get_policy(
+            IncrementalEmpiricalCovariance, queue, X
+        )
 
         X_table = to_table(X, queue=queue)
 
         if not hasattr(self, "_dtype"):
             self._dtype = X_table.dtype
 
         params = self._get_onedal_params(self._dtype)
-        self._partial_result = self._get_backend(
+        # Not supported with spmd policy so IncrementalEmpiricalCovariance must be specified
+        self._partial_result = IncrementalEmpiricalCovariance._get_backend(
+            IncrementalEmpiricalCovariance,
             "covariance",
             None,
             "partial_compute",
diff --git a/onedal/decomposition/incremental_pca.py b/onedal/decomposition/incremental_pca.py
@@ -100,7 +100,10 @@ def __init__(
 
     def _reset(self):
         self._need_to_finalize = False
-        module = self._get_backend("decomposition", "dim_reduction")
+        # Not supported with spmd policy so IncrementalPCA must be specified
+        module = IncrementalPCA._get_backend(
+            IncrementalPCA, "decomposition", "dim_reduction"
+        )
         if hasattr(self, "components_"):
             del self.components_
         self._partial_result = module.partial_train_result()
@@ -154,14 +157,17 @@ def partial_fit(self, X, queue):
 
         self._queue = queue
 
-        policy = self._get_policy(queue, X)
+        # Not supported with spmd policy so IncrementalPCA must be specified
+        policy = IncrementalPCA._get_policy(IncrementalPCA, queue, X)
         X_table = to_table(X, queue=queue)
 
         if not hasattr(self, "_dtype"):
             self._dtype = X_table.dtype
             self._params = self._get_onedal_params(X_table)
 
-        self._partial_result = self._get_backend(
+        # Not supported with spmd policy so IncrementalPCA must be specified
+        self._partial_result = IncrementalPCA._get_backend(
+            IncrementalPCA,
             "decomposition",
             "dim_reduction",
             "partial_train",
diff --git a/onedal/decomposition/pca.py b/onedal/decomposition/pca.py
@@ -119,7 +119,8 @@ def _compute_noise_variance(self, n_components, n_sf_min):
             return 0.0
 
     def _create_model(self):
-        m = self._get_backend("decomposition", "dim_reduction", "model")
+        # Not supported with spmd policy so BasePCA must be specified
+        m = BasePCA._get_backend(BasePCA, "decomposition", "dim_reduction", "model")
         m.eigenvectors = to_table(self.components_)
         m.means = to_table(self.mean_)
         if self.whiten:
@@ -128,16 +129,27 @@ def _create_model(self):
         return m
 
     def predict(self, X, queue=None):
-        policy = self._get_policy(queue, X)
+        # Not supported with spmd policy so BasePCA must be specified
+        policy = BasePCA._get_policy(BasePCA, queue, X)
         model = self._create_model()
         X_table = to_table(X, queue=queue)
         params = self._get_onedal_params(X_table, stage="predict")
 
-        result = self._get_backend(
-            "decomposition", "dim_reduction", "infer", policy, params, model, X_table
+        # Not supported with spmd policy so BasePCA must be specified
+        result = BasePCA._get_backend(
+            BasePCA,
+            "decomposition",
+            "dim_reduction",
+            "infer",
+            policy,
+            params,
+            model,
+            X_table,
         )
         return from_table(result.transformed_data)
 
+    transform = predict
+
 
 class PCA(BasePCA):
 
diff --git a/onedal/decomposition/tests/test_incremental_pca.py b/onedal/decomposition/tests/test_incremental_pca.py
@@ -40,7 +40,7 @@ def test_on_gold_data(queue, is_deterministic, whiten, num_blocks, dtype):
 
     result = incpca.finalize_fit()
 
-    transformed_data = incpca.predict(X, queue=queue)
+    transformed_data = incpca.transform(X, queue=queue)
 
     expected_n_components_ = 2
     expected_components_ = np.array([[0.83849224, 0.54491354], [-0.54491354, 0.83849224]])
@@ -128,7 +128,7 @@ def test_on_random_data(
 
     incpca.finalize_fit()
 
-    transformed_data = incpca.predict(X, queue=queue)
+    transformed_data = incpca.transform(X, queue=queue)
     tol = 3e-3 if transformed_data.dtype == np.float32 else 2e-6
 
     n_components = incpca.n_components_
diff --git a/onedal/linear_model/incremental_linear_model.py b/onedal/linear_model/incremental_linear_model.py
@@ -48,8 +48,12 @@ def __init__(self, fit_intercept=True, copy_X=False, algorithm="norm_eq"):
 
     def _reset(self):
         self._need_to_finalize = False
-        self._partial_result = self._get_backend(
-            "linear_model", "regression", "partial_train_result"
+        # Not supported with spmd policy so IncrementalLinearRegression must be specified
+        self._partial_result = IncrementalLinearRegression._get_backend(
+            IncrementalLinearRegression,
+            "linear_model",
+            "regression",
+            "partial_train_result",
         )
 
     def __getstate__(self):
@@ -84,10 +88,16 @@ def partial_fit(self, X, y, queue=None):
         self : object
             Returns the instance itself.
         """
-        module = self._get_backend("linear_model", "regression")
+        # Not supported with spmd policy so IncrementalLinearRegression must be specified
+        module = IncrementalLinearRegression._get_backend(
+            IncrementalLinearRegression, "linear_model", "regression"
+        )
 
         self._queue = queue
-        policy = self._get_policy(queue, X)
+        # Not supported with spmd policy so IncrementalLinearRegression must be specified
+        policy = IncrementalLinearRegression._get_policy(
+            IncrementalLinearRegression, queue, X
+        )
 
         X, y = _check_X_y(
             X, y, dtype=[np.float64, np.float32], accept_2d_y=True, force_all_finite=False
diff --git a/onedal/spmd/basic_statistics/incremental_basic_statistics.py b/onedal/spmd/basic_statistics/incremental_basic_statistics.py
@@ -14,58 +14,11 @@
 # limitations under the License.
 # ==============================================================================
 
-from daal4py.sklearn._utils import get_dtype
-
 from ...basic_statistics import (
     IncrementalBasicStatistics as base_IncrementalBasicStatistics,
 )
-from ...datatypes import to_table
 from .._base import BaseEstimatorSPMD
 
 
 class IncrementalBasicStatistics(BaseEstimatorSPMD, base_IncrementalBasicStatistics):
-    def _reset(self):
-        self._need_to_finalize = False
-        self._partial_result = super(base_IncrementalBasicStatistics, self)._get_backend(
-            "basic_statistics", None, "partial_compute_result"
-        )
-
-    def partial_fit(self, X, weights=None, queue=None):
-        """
-        Computes partial data for basic statistics
-        from data batch X and saves it to `_partial_result`.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Training data batch, where `n_samples` is the number of samples
-            in the batch, and `n_features` is the number of features.
-
-        queue : dpctl.SyclQueue
-            If not None, use this queue for computations.
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself.
-        """
-        self._queue = queue
-        policy = super(base_IncrementalBasicStatistics, self)._get_policy(queue, X)
-        X_table, weights_table = to_table(X, weights, queue=queue)
-
-        if not hasattr(self, "_onedal_params"):
-            self._onedal_params = self._get_onedal_params(False, dtype=X_table.dtype)
-
-        self._partial_result = super(base_IncrementalBasicStatistics, self)._get_backend(
-            "basic_statistics",
-            None,
-            "partial_compute",
-            policy,
-            self._onedal_params,
-            self._partial_result,
-            X_table,
-            weights_table,
-        )
-
-        self._need_to_finalize = True
-        return self
+    pass
diff --git a/onedal/spmd/covariance/incremental_covariance.py b/onedal/spmd/covariance/incremental_covariance.py
@@ -14,70 +14,13 @@
 # limitations under the License.
 # ==============================================================================
 
-import numpy as np
-
-from daal4py.sklearn._utils import get_dtype
-
 from ...covariance import (
     IncrementalEmpiricalCovariance as base_IncrementalEmpiricalCovariance,
 )
-from ...datatypes import to_table
-from ...utils import _check_array
 from .._base import BaseEstimatorSPMD
 
 
 class IncrementalEmpiricalCovariance(
     BaseEstimatorSPMD, base_IncrementalEmpiricalCovariance
 ):
-    def _reset(self):
-        self._need_to_finalize = False
-        self._partial_result = super(
-            base_IncrementalEmpiricalCovariance, self
-        )._get_backend("covariance", None, "partial_compute_result")
-
-    def partial_fit(self, X, y=None, queue=None):
-        """
-        Computes partial data for the covariance matrix
-        from data batch X and saves it to `_partial_result`.
-
-        Parameters
-        ----------
-        X : array-like of shape (n_samples, n_features)
-            Training data batch, where `n_samples` is the number of samples
-            in the batch, and `n_features` is the number of features.
-
-        y : Ignored
-            Not used, present for API consistency by convention.
-
-        queue : dpctl.SyclQueue
-            If not None, use this queue for computations.
-
-        Returns
-        -------
-        self : object
-            Returns the instance itself.
-        """
-        X = _check_array(X, dtype=[np.float64, np.float32], ensure_2d=True)
-
-        self._queue = queue
-
-        policy = super(base_IncrementalEmpiricalCovariance, self)._get_policy(queue, X)
-
-        X_table = to_table(X, queue=queue)
-
-        if not hasattr(self, "_dtype"):
-            self._dtype = X_table.dtype
-
-        params = self._get_onedal_params(self._dtype)
-        self._partial_result = super(
-            base_IncrementalEmpiricalCovariance, self
-        )._get_backend(
-            "covariance",
-            None,
-            "partial_compute",
-            policy,
-            params,
-            self._partial_result,
-            X_table,
-        )
-        self._need_to_finalize = True
+    pass
diff --git a/onedal/spmd/decomposition/incremental_pca.py b/onedal/spmd/decomposition/incremental_pca.py
diff --git a/onedal/spmd/linear_model/incremental_linear_model.py b/onedal/spmd/linear_model/incremental_linear_model.py