Skip to content

Commit

Permalink
ENH: SPMD interface for IncrementalPCA (#1979)
Browse files Browse the repository at this point in the history
  • Loading branch information
olegkkruglov authored Sep 5, 2024
1 parent d9f46b7 commit 9f63db2
Show file tree
Hide file tree
Showing 8 changed files with 458 additions and 27 deletions.
46 changes: 29 additions & 17 deletions onedal/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,14 @@ def __init__(
self.method = method
self.is_deterministic = is_deterministic
self.whiten = whiten
module = self._get_backend("decomposition", "dim_reduction")
self._partial_result = module.partial_train_result()
self._reset()

def _reset(self):
module = self._get_backend("decomposition", "dim_reduction")
del self.components_
self._partial_result = module.partial_train_result()
self._partial_result = self._get_backend(
"decomposition", "dim_reduction", "partial_train_result"
)
if hasattr(self, "components_"):
del self.components_

def partial_fit(self, X, queue):
"""Incremental fit with X. All of X is processed as a single batch.
Expand All @@ -116,9 +117,6 @@ def partial_fit(self, X, queue):
y : Ignored
Not used, present for API consistency by convention.
check_input : bool, default=True
Run check_array on X.
Returns
-------
self : object
Expand All @@ -143,20 +141,24 @@ def partial_fit(self, X, queue):
else:
self.n_components_ = self.n_components

module = self._get_backend("decomposition", "dim_reduction")

if not hasattr(self, "_policy"):
self._policy = self._get_policy(queue, X)
self._queue = queue

X = _convert_to_supported(self._policy, X)
policy = self._get_policy(queue, X)
X = _convert_to_supported(policy, X)

if not hasattr(self, "_dtype"):
self._dtype = get_dtype(X)
self._params = self._get_onedal_params(X)

X_table = to_table(X)
self._partial_result = module.partial_train(
self._policy, self._params, self._partial_result, X_table
self._partial_result = self._get_backend(
"decomposition",
"dim_reduction",
"partial_train",
policy,
self._params,
self._partial_result,
X_table,
)
return self

Expand All @@ -175,8 +177,18 @@ def finalize_fit(self, queue=None):
self : object
Returns the instance itself.
"""
module = self._get_backend("decomposition", "dim_reduction")
result = module.finalize_train(self._policy, self._params, self._partial_result)
if queue is not None:
policy = self._get_policy(queue)
else:
policy = self._get_policy(self._queue)
result = self._get_backend(
"decomposition",
"dim_reduction",
"finalize_train",
policy,
self._params,
self._partial_result,
)
self.mean_ = from_table(result.means).ravel()
self.var_ = from_table(result.variances).ravel()
self.components_ = from_table(result.eigenvectors)
Expand Down
1 change: 1 addition & 0 deletions onedal/decomposition/pca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ ONEDAL_PY_INIT_MODULE(decomposition) {
auto sub = m.def_submodule("decomposition");
#ifdef ONEDAL_DATA_PARALLEL_SPMD
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
ONEDAL_PY_INSTANTIATE(init_finalize_train_ops, sub, policy_spmd, task_list);
#else
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list, task_list);
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list, task_list);
Expand Down
3 changes: 2 additions & 1 deletion onedal/spmd/decomposition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
# ==============================================================================

from .incremental_pca import IncrementalPCA
from .pca import PCA

__all__ = ["PCA"]
__all__ = ["IncrementalPCA", "PCA"]
117 changes: 117 additions & 0 deletions onedal/spmd/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# ==============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from daal4py.sklearn._utils import get_dtype

from ...datatypes import _convert_to_supported, from_table, to_table
from ...decomposition import IncrementalPCA as base_IncrementalPCA
from ...utils import _check_array
from .._base import BaseEstimatorSPMD


class IncrementalPCA(BaseEstimatorSPMD, base_IncrementalPCA):
    """
    Distributed incremental estimator for PCA based on oneDAL implementation.
    Allows for distributed PCA computation if data is split into batches.
    API is the same as for `onedal.decomposition.IncrementalPCA`
    """

    def _reset(self):
        # Drop any accumulated partial state and start a fresh training round.
        # NOTE: super(base_IncrementalPCA, self) deliberately skips the base
        # estimator in the MRO so the SPMD backend is used.
        self._partial_result = super(base_IncrementalPCA, self)._get_backend(
            "decomposition", "dim_reduction", "partial_train_result"
        )
        if hasattr(self, "components_"):
            del self.components_

    def partial_fit(self, X, y=None, queue=None):
        """Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : Ignored
            Not used, present for API consistency by convention.
        queue : SyclQueue or None, default=None
            Queue used for the distributed computation.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = _check_array(X)
        n_samples, n_features = X.shape

        # On the first batch initialize running counters; afterwards only
        # accumulate the number of samples seen.
        if hasattr(self, "components_"):
            self.n_samples_seen_ += n_samples
        else:
            self.components_ = None
            self.n_samples_seen_ = n_samples
            self.n_features_in_ = n_features

        # Resolve the effective number of components for this round.
        if self.n_components is not None:
            self.n_components_ = self.n_components
        elif self.components_ is None:
            self.n_components_ = min(n_samples, n_features)
        else:
            self.n_components_ = self.components_.shape[0]

        # Remember the queue so a later finalize_fit() without an explicit
        # queue can reuse it.
        self._queue = queue

        spmd_policy = super(base_IncrementalPCA, self)._get_policy(queue, X)
        X = _convert_to_supported(spmd_policy, X)

        # Fix dtype and oneDAL parameters from the first batch only.
        if not hasattr(self, "_dtype"):
            self._dtype = get_dtype(X)
            self._params = self._get_onedal_params(X)

        data_table = to_table(X)
        self._partial_result = super(base_IncrementalPCA, self)._get_backend(
            "decomposition",
            "dim_reduction",
            "partial_train",
            spmd_policy,
            self._params,
            self._partial_result,
            data_table,
        )
        return self

    def _create_model(self):
        # Package fitted attributes into a oneDAL model for inference.
        model = super(base_IncrementalPCA, self)._get_backend(
            "decomposition", "dim_reduction", "model"
        )
        model.eigenvectors = to_table(self.components_)
        model.means = to_table(self.mean_)
        if self.whiten:
            model.eigenvalues = to_table(self.explained_variance_)
        self._onedal_model = model
        return model

    def predict(self, X, queue=None):
        # Project X onto the fitted principal components via the SPMD backend.
        inference_policy = super(base_IncrementalPCA, self)._get_policy(queue, X)
        onedal_model = self._create_model()
        X = _convert_to_supported(inference_policy, X)
        infer_params = self._get_onedal_params(X, stage="predict")

        result = super(base_IncrementalPCA, self)._get_backend(
            "decomposition",
            "dim_reduction",
            "infer",
            inference_policy,
            infer_params,
            onedal_model,
            to_table(X),
        )
        return from_table(result.transformed_data)
16 changes: 8 additions & 8 deletions sklearnex/preview/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _onedal_fit_transform(self, X, queue=None):
return self._onedal_transform(X, queue)

def _onedal_partial_fit(self, X, check_input=True, queue=None):
first_pass = not hasattr(self, "components_")
first_pass = not hasattr(self, "_onedal_estimator")

if check_input:
if sklearn_check_version("1.0"):
Expand All @@ -78,10 +78,10 @@ def _onedal_partial_fit(self, X, check_input=True, queue=None):
n_samples, n_features = X.shape

if self.n_components is None:
if not hasattr(self, "components_"):
if not hasattr(self, "_components_shape"):
self.n_components_ = min(n_samples, n_features)
else:
self.n_components_ = self.components_.shape[0]
self._components_shape = self.n_components_

elif not self.n_components <= n_features:
raise ValueError(
"n_components=%r invalid for n_features=%d, need "
Expand All @@ -106,12 +106,12 @@ def _onedal_partial_fit(self, X, check_input=True, queue=None):

if not hasattr(self, "_onedal_estimator"):
self._onedal_estimator = self._onedal_incremental_pca(**onedal_params)
self._onedal_estimator.partial_fit(X, queue)
self._onedal_estimator.partial_fit(X, queue=queue)
self._need_to_finalize = True

def _onedal_finalize_fit(self):
def _onedal_finalize_fit(self, queue=None):
assert hasattr(self, "_onedal_estimator")
self._onedal_estimator.finalize_fit()
self._onedal_estimator.finalize_fit(queue=queue)
self._need_to_finalize = False

def _onedal_fit(self, X, queue=None):
Expand Down Expand Up @@ -142,7 +142,7 @@ def _onedal_fit(self, X, queue=None):
X_batch = X[batch]
self._onedal_partial_fit(X_batch, queue=queue)

self._onedal_finalize_fit()
self._onedal_finalize_fit(queue=queue)

return self

Expand Down
3 changes: 2 additions & 1 deletion sklearnex/spmd/decomposition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
# ==============================================================================

from .incremental_pca import IncrementalPCA
from .pca import PCA

__all__ = ["PCA"]
__all__ = ["IncrementalPCA", "PCA"]
30 changes: 30 additions & 0 deletions sklearnex/spmd/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# ==============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from onedal.spmd.decomposition import IncrementalPCA as onedalSPMD_IncrementalPCA

from ...preview.decomposition import IncrementalPCA as base_IncrementalPCA


class IncrementalPCA(base_IncrementalPCA):
    """
    Distributed incremental estimator for PCA based on sklearnex implementation.
    Allows for distributed PCA computation if data is split into batches.
    API is the same as for `sklearnex.decomposition.IncrementalPCA`
    """

    # Route all oneDAL calls made by the base estimator to the SPMD
    # (distributed) oneDAL implementation; staticmethod prevents the class
    # reference from being bound as an instance method on attribute access.
    _onedal_incremental_pca = staticmethod(onedalSPMD_IncrementalPCA)
Loading

0 comments on commit 9f63db2

Please sign in to comment.