Skip to content

Commit

Permalink
ENH: SPMD interface for IncrementalPCA (#1979)
Browse files Browse the repository at this point in the history
  • Loading branch information
olegkkruglov authored Sep 5, 2024
1 parent d9f46b7 commit 9f63db2
Show file tree
Hide file tree
Showing 8 changed files with 458 additions and 27 deletions.
46 changes: 29 additions & 17 deletions onedal/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,14 @@ def __init__(
self.method = method
self.is_deterministic = is_deterministic
self.whiten = whiten
module = self._get_backend("decomposition", "dim_reduction")
self._partial_result = module.partial_train_result()
self._reset()

def _reset(self):
module = self._get_backend("decomposition", "dim_reduction")
del self.components_
self._partial_result = module.partial_train_result()
self._partial_result = self._get_backend(
"decomposition", "dim_reduction", "partial_train_result"
)
if hasattr(self, "components_"):
del self.components_

def partial_fit(self, X, queue):
"""Incremental fit with X. All of X is processed as a single batch.
Expand All @@ -116,9 +117,6 @@ def partial_fit(self, X, queue):
y : Ignored
Not used, present for API consistency by convention.
check_input : bool, default=True
Run check_array on X.
Returns
-------
self : object
Expand All @@ -143,20 +141,24 @@ def partial_fit(self, X, queue):
else:
self.n_components_ = self.n_components

module = self._get_backend("decomposition", "dim_reduction")

if not hasattr(self, "_policy"):
self._policy = self._get_policy(queue, X)
self._queue = queue

X = _convert_to_supported(self._policy, X)
policy = self._get_policy(queue, X)
X = _convert_to_supported(policy, X)

if not hasattr(self, "_dtype"):
self._dtype = get_dtype(X)
self._params = self._get_onedal_params(X)

X_table = to_table(X)
self._partial_result = module.partial_train(
self._policy, self._params, self._partial_result, X_table
self._partial_result = self._get_backend(
"decomposition",
"dim_reduction",
"partial_train",
policy,
self._params,
self._partial_result,
X_table,
)
return self

Expand All @@ -175,8 +177,18 @@ def finalize_fit(self, queue=None):
self : object
Returns the instance itself.
"""
module = self._get_backend("decomposition", "dim_reduction")
result = module.finalize_train(self._policy, self._params, self._partial_result)
if queue is not None:
policy = self._get_policy(queue)
else:
policy = self._get_policy(self._queue)
result = self._get_backend(
"decomposition",
"dim_reduction",
"finalize_train",
policy,
self._params,
self._partial_result,
)
self.mean_ = from_table(result.means).ravel()
self.var_ = from_table(result.variances).ravel()
self.components_ = from_table(result.eigenvectors)
Expand Down
1 change: 1 addition & 0 deletions onedal/decomposition/pca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ ONEDAL_PY_INIT_MODULE(decomposition) {
auto sub = m.def_submodule("decomposition");
#ifdef ONEDAL_DATA_PARALLEL_SPMD
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_spmd, task_list);
ONEDAL_PY_INSTANTIATE(init_finalize_train_ops, sub, policy_spmd, task_list);
#else
ONEDAL_PY_INSTANTIATE(init_train_ops, sub, policy_list, task_list);
ONEDAL_PY_INSTANTIATE(init_infer_ops, sub, policy_list, task_list);
Expand Down
3 changes: 2 additions & 1 deletion onedal/spmd/decomposition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
# ==============================================================================

from .incremental_pca import IncrementalPCA
from .pca import PCA

__all__ = ["PCA"]
__all__ = ["IncrementalPCA", "PCA"]
117 changes: 117 additions & 0 deletions onedal/spmd/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# ==============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from daal4py.sklearn._utils import get_dtype

from ...datatypes import _convert_to_supported, from_table, to_table
from ...decomposition import IncrementalPCA as base_IncrementalPCA
from ...utils import _check_array
from .._base import BaseEstimatorSPMD


class IncrementalPCA(BaseEstimatorSPMD, base_IncrementalPCA):
    """
    Distributed incremental estimator for PCA based on oneDAL implementation.
    Allows for distributed PCA computation if data is split into batches.
    API is the same as for `onedal.decomposition.IncrementalPCA`
    """

    def _reset(self):
        # Drop any accumulated partial state and start a fresh training round.
        # NOTE: super(base_IncrementalPCA, self) deliberately skips the base
        # estimator in the MRO so the SPMD backend is used.
        self._partial_result = super(base_IncrementalPCA, self)._get_backend(
            "decomposition", "dim_reduction", "partial_train_result"
        )
        if hasattr(self, "components_"):
            del self.components_

    def partial_fit(self, X, y=None, queue=None):
        """Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
        y : Ignored
            Not used, present for API consistency by convention.
        queue : SyclQueue or None, default=None
            Queue used for the distributed computation.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = _check_array(X)
        n_samples, n_features = X.shape

        # On the first batch initialize running counters; afterwards only
        # accumulate the number of samples seen.
        if hasattr(self, "components_"):
            self.n_samples_seen_ += n_samples
        else:
            self.components_ = None
            self.n_samples_seen_ = n_samples
            self.n_features_in_ = n_features

        # Resolve the effective number of components for this round.
        if self.n_components is not None:
            self.n_components_ = self.n_components
        elif self.components_ is None:
            self.n_components_ = min(n_samples, n_features)
        else:
            self.n_components_ = self.components_.shape[0]

        # Remember the queue so a later finalize_fit() without an explicit
        # queue can reuse it.
        self._queue = queue

        spmd_policy = super(base_IncrementalPCA, self)._get_policy(queue, X)
        X = _convert_to_supported(spmd_policy, X)

        # Fix dtype and oneDAL parameters from the first batch only.
        if not hasattr(self, "_dtype"):
            self._dtype = get_dtype(X)
            self._params = self._get_onedal_params(X)

        data_table = to_table(X)
        self._partial_result = super(base_IncrementalPCA, self)._get_backend(
            "decomposition",
            "dim_reduction",
            "partial_train",
            spmd_policy,
            self._params,
            self._partial_result,
            data_table,
        )
        return self

    def _create_model(self):
        # Package fitted attributes into a oneDAL model for inference.
        model = super(base_IncrementalPCA, self)._get_backend(
            "decomposition", "dim_reduction", "model"
        )
        model.eigenvectors = to_table(self.components_)
        model.means = to_table(self.mean_)
        if self.whiten:
            model.eigenvalues = to_table(self.explained_variance_)
        self._onedal_model = model
        return model

    def predict(self, X, queue=None):
        # Project X onto the fitted principal components via the SPMD backend.
        inference_policy = super(base_IncrementalPCA, self)._get_policy(queue, X)
        onedal_model = self._create_model()
        X = _convert_to_supported(inference_policy, X)
        infer_params = self._get_onedal_params(X, stage="predict")

        result = super(base_IncrementalPCA, self)._get_backend(
            "decomposition",
            "dim_reduction",
            "infer",
            inference_policy,
            infer_params,
            onedal_model,
            to_table(X),
        )
        return from_table(result.transformed_data)
16 changes: 8 additions & 8 deletions sklearnex/preview/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _onedal_fit_transform(self, X, queue=None):
return self._onedal_transform(X, queue)

def _onedal_partial_fit(self, X, check_input=True, queue=None):
first_pass = not hasattr(self, "components_")
first_pass = not hasattr(self, "_onedal_estimator")

if check_input:
if sklearn_check_version("1.0"):
Expand All @@ -78,10 +78,10 @@ def _onedal_partial_fit(self, X, check_input=True, queue=None):
n_samples, n_features = X.shape

if self.n_components is None:
if not hasattr(self, "components_"):
if not hasattr(self, "_components_shape"):
self.n_components_ = min(n_samples, n_features)
else:
self.n_components_ = self.components_.shape[0]
self._components_shape = self.n_components_

elif not self.n_components <= n_features:
raise ValueError(
"n_components=%r invalid for n_features=%d, need "
Expand All @@ -106,12 +106,12 @@ def _onedal_partial_fit(self, X, check_input=True, queue=None):

if not hasattr(self, "_onedal_estimator"):
self._onedal_estimator = self._onedal_incremental_pca(**onedal_params)
self._onedal_estimator.partial_fit(X, queue)
self._onedal_estimator.partial_fit(X, queue=queue)
self._need_to_finalize = True

def _onedal_finalize_fit(self):
def _onedal_finalize_fit(self, queue=None):
assert hasattr(self, "_onedal_estimator")
self._onedal_estimator.finalize_fit()
self._onedal_estimator.finalize_fit(queue=queue)
self._need_to_finalize = False

def _onedal_fit(self, X, queue=None):
Expand Down Expand Up @@ -142,7 +142,7 @@ def _onedal_fit(self, X, queue=None):
X_batch = X[batch]
self._onedal_partial_fit(X_batch, queue=queue)

self._onedal_finalize_fit()
self._onedal_finalize_fit(queue=queue)

return self

Expand Down
3 changes: 2 additions & 1 deletion sklearnex/spmd/decomposition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
# ==============================================================================

from .incremental_pca import IncrementalPCA
from .pca import PCA

__all__ = ["PCA"]
__all__ = ["IncrementalPCA", "PCA"]
30 changes: 30 additions & 0 deletions sklearnex/spmd/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# ==============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from onedal.spmd.decomposition import IncrementalPCA as onedalSPMD_IncrementalPCA

from ...preview.decomposition import IncrementalPCA as base_IncrementalPCA


class IncrementalPCA(base_IncrementalPCA):
    """
    Distributed incremental estimator for PCA based on sklearnex implementation.
    Allows for distributed PCA computation if data is split into batches.
    API is the same as for `sklearnex.decomposition.IncrementalPCA`
    """

    # Route all oneDAL calls made by the base estimator to the SPMD
    # (distributed) oneDAL implementation; staticmethod prevents the class
    # reference from being bound as an instance method on attribute access.
    _onedal_incremental_pca = staticmethod(onedalSPMD_IncrementalPCA)
Loading

0 comments on commit 9f63db2

Please sign in to comment.