Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] WIP new finite checking in EmpiricalCovariance and IncrementalEmpiricalCovariance #2207

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions onedal/covariance/covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@

import numpy as np

from daal4py.sklearn._utils import daal_check_version, get_dtype
from onedal.utils import _check_array
from daal4py.sklearn._utils import daal_check_version

from ..common._base import BaseEstimator
from ..common.hyperparameters import get_hyperparameters
Expand Down Expand Up @@ -94,10 +93,8 @@ def fit(self, X, y=None, queue=None):
Returns the instance itself.
"""
policy = self._get_policy(queue, X)
X = _check_array(X, dtype=[np.float64, np.float32])
X = _convert_to_supported(policy, X)
dtype = get_dtype(X)
params = self._get_onedal_params(dtype)
X_table = to_table(_convert_to_supported(policy, X))
params = self._get_onedal_params(X_table.dtype)
hparams = get_hyperparameters("covariance", "compute")
if hparams is not None and not hparams.is_default:
result = self._get_backend(
Expand All @@ -107,7 +104,7 @@ def fit(self, X, y=None, queue=None):
policy,
params,
hparams.backend,
to_table(X),
X_table,
)
else:
result = self._get_backend(
Expand Down
14 changes: 4 additions & 10 deletions onedal/covariance/incremental_covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ===============================================================================
import numpy as np

from daal4py.sklearn._utils import daal_check_version, get_dtype
from daal4py.sklearn._utils import daal_check_version

from ..datatypes import _convert_to_supported, from_table, to_table
from ..utils import _check_array
from .covariance import BaseEmpiricalCovariance


Expand Down Expand Up @@ -95,27 +93,23 @@ def partial_fit(self, X, y=None, queue=None):
self : object
Returns the instance itself.
"""
X = _check_array(X, dtype=[np.float64, np.float32], ensure_2d=True)

self._queue = queue

policy = self._get_policy(queue, X)

X = _convert_to_supported(policy, X)

X_table = to_table(_convert_to_supported(policy, X))
if not hasattr(self, "_dtype"):
self._dtype = get_dtype(X)
self._dtype = X_table.dtype

params = self._get_onedal_params(self._dtype)
table_X = to_table(X)
self._partial_result = self._get_backend(
"covariance",
None,
"partial_compute",
policy,
params,
self._partial_result,
table_X,
X_table,
)
self._need_to_finalize = True

Expand Down
6 changes: 3 additions & 3 deletions onedal/covariance/tests/test_covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,23 @@
def test_onedal_import_covariance(queue):
from onedal.covariance import EmpiricalCovariance

X = np.array([[0, 1], [0, 1]])
X = np.array([[0, 1], [0, 1]], dtype=np.float64)
result = EmpiricalCovariance().fit(X, queue=queue)
expected_covariance = np.array([[0, 0], [0, 0]])
expected_means = np.array([0, 1])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)

X = np.array([[1, 2], [3, 6]])
X = np.array([[1, 2], [3, 6]], dtype=np.float64)
result = EmpiricalCovariance().fit(X, queue=queue)
expected_covariance = np.array([[2, 4], [4, 8]])
expected_means = np.array([2, 4])

assert_allclose(expected_covariance, result.covariance_)
assert_allclose(expected_means, result.location_)

X = np.array([[1, 2], [3, 6]])
X = np.array([[1, 2], [3, 6]], dtype=np.float64)
result = EmpiricalCovariance(bias=True).fit(X, queue=queue)
expected_covariance = np.array([[1, 2], [2, 4]])
expected_means = np.array([2, 4])
Expand Down
40 changes: 16 additions & 24 deletions sklearnex/covariance/incremental_covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,17 @@
from onedal.covariance import (
IncrementalEmpiricalCovariance as onedal_IncrementalEmpiricalCovariance,
)
from sklearnex import config_context
from onedal.utils._array_api import _is_numpy_namespace

from .._device_offload import dispatch, wrap_output_data
from .._utils import IntelEstimator, PatchingConditionsChain, register_hyperparameters
from ..metrics import pairwise_distances
from ..utils._array_api import get_namespace
from ..utils.validation import validate_data

if sklearn_check_version("1.2"):
from sklearn.utils._param_validation import Interval

if sklearn_check_version("1.6"):
from sklearn.utils.validation import validate_data
else:
validate_data = BaseEstimator._validate_data


@control_n_jobs(decorated_methods=["partial_fit", "fit", "_onedal_finalize_fit"])
class IncrementalEmpiricalCovariance(IntelEstimator, BaseEstimator):
Expand Down Expand Up @@ -152,8 +148,9 @@ def _onedal_finalize_fit(self, queue=None):

if not daal_check_version((2024, "P", 400)) and self.assume_centered:
location = self._onedal_estimator.location_[None, :]
self._onedal_estimator.covariance_ += np.dot(location.T, location)
self._onedal_estimator.location_ = np.zeros_like(np.squeeze(location))
lp, _ = get_namespace(location)
self._onedal_estimator.covariance_ += lp.dot(location.T, location)
self._onedal_estimator.location_ = lp.zeros_like(lp.squeeze(location))
if self.store_precision:
self.precision_ = linalg.pinvh(
self._onedal_estimator.covariance_, check_finite=False
Expand Down Expand Up @@ -187,26 +184,24 @@ def _onedal_partial_fit(self, X, queue=None, check_input=True):

first_pass = not hasattr(self, "n_samples_seen_") or self.n_samples_seen_ == 0

# finite check occurs on onedal side
if check_input:
xp, _ = get_namespace(X)
if sklearn_check_version("1.2"):
self._validate_params()

if sklearn_check_version("1.0"):
X = validate_data(
self,
X,
dtype=[np.float64, np.float32],
dtype=[xp.float64, xp.float32],
reset=first_pass,
copy=self.copy,
force_all_finite=False,
)
else:
X = check_array(
X,
dtype=[np.float64, np.float32],
dtype=[xp.float64, xp.float32],
copy=self.copy,
force_all_finite=False,
)

onedal_params = {
Expand Down Expand Up @@ -239,16 +234,16 @@ def score(self, X_test, y=None):
X = validate_data(
self,
X_test,
dtype=[np.float64, np.float32],
dtype=[xp.float64, xp.float32],
reset=False,
)
else:
X = check_array(
X_test,
dtype=[np.float64, np.float32],
dtype=[xp.float64, xp.float32],
)

if "numpy" not in xp.__name__:
if not _is_numpy_namespace(xp):
location = xp.asarray(location, device=X_test.device)
# depending on the sklearn version, check_array
# and validate_data will return only numpy arrays
Expand Down Expand Up @@ -337,19 +332,16 @@ def _onedal_fit(self, X, queue=None):
if sklearn_check_version("1.2"):
self._validate_params()

# finite check occurs on onedal side
xp, _ = get_namespace(X)
if sklearn_check_version("1.0"):
X = validate_data(
self,
X,
dtype=[np.float64, np.float32],
dtype=[xp.float64, xp.float32],
copy=self.copy,
force_all_finite=False,
)
else:
X = check_array(
X, dtype=[np.float64, np.float32], copy=self.copy, force_all_finite=False
)
X = check_array(X, dtype=[xp.float64, xp.float32], copy=self.copy)
self.n_features_in_ = X.shape[1]

self.batch_size_ = self.batch_size if self.batch_size else 5 * self.n_features_in_
Expand Down Expand Up @@ -378,8 +370,8 @@ def mahalanobis(self, X):
# pairwise_distances will check n_features (via n_feature matching with
# self.location_) , and will check for finiteness via check array
# check_feature_names will match _validate_data functionally
location = self.location_[np.newaxis, :]
if "numpy" not in xp.__name__:
location = self.location_[None, :]
if not _is_numpy_namespace(xp):
# Guarantee that inputs to pairwise_distances match in type and location
location = xp.asarray(location, device=X.device)

Expand Down
38 changes: 18 additions & 20 deletions sklearnex/preview/covariance/covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@

import warnings

import numpy as np
from scipy import sparse as sp
import scipy.sparse as sp
from sklearn.covariance import EmpiricalCovariance as _sklearn_EmpiricalCovariance
from sklearn.utils import check_array

Expand All @@ -30,11 +29,8 @@

from ..._device_offload import dispatch, wrap_output_data
from ..._utils import PatchingConditionsChain, register_hyperparameters

if sklearn_check_version("1.6"):
from sklearn.utils.validation import validate_data
else:
validate_data = _sklearn_EmpiricalCovariance._validate_data
from ...utils._array_api import get_namespace
from ...utils.validation import validate_data


@register_hyperparameters({"fit": get_hyperparameters("covariance", "compute")})
Expand All @@ -51,14 +47,23 @@ def _save_attributes(self):
assert hasattr(self, "_onedal_estimator")
if not daal_check_version((2024, "P", 400)) and self.assume_centered:
location = self._onedal_estimator.location_[None, :]
self._onedal_estimator.covariance_ += np.dot(location.T, location)
self._onedal_estimator.location_ = np.zeros_like(np.squeeze(location))
lp, _ = get_namespace(location)
self._onedal_estimator.covariance_ += lp.dot(location.T, location)
self._onedal_estimator.location_ = lp.zeros_like(lp.squeeze(location))
self._set_covariance(self._onedal_estimator.covariance_)
self.location_ = self._onedal_estimator.location_

_onedal_covariance = staticmethod(onedal_EmpiricalCovariance)

def _onedal_fit(self, X, queue=None):
xp, _ = get_namespace(X)
if sklearn_check_version("1.2"):
self._validate_params()
if sklearn_check_version("1.0"):
X = validate_data(self, X, dtype=[xp.float64, xp.float32])
else:
X = check_array(X)

if X.shape[0] == 1:
warnings.warn(
"Only one sample available. You may want to reshape your data array"
Expand Down Expand Up @@ -93,13 +98,6 @@ def _onedal_supported(self, method_name, *data):
_onedal_gpu_supported = _onedal_supported

def fit(self, X, y=None):
if sklearn_check_version("1.2"):
self._validate_params()
if sklearn_check_version("0.23"):
X = validate_data(self, X, force_all_finite=False)
else:
X = check_array(X, force_all_finite=False)

dispatch(
self,
"fit",
Expand All @@ -113,21 +111,21 @@ def fit(self, X, y=None):
return self

# expose sklearnex pairwise_distances if mahalanobis distance eventually supported
@wrap_output_data
def mahalanobis(self, X):
xp, _ = get_namespace(X)
if sklearn_check_version("1.0"):
X = validate_data(self, X, reset=False)
X = validate_data(self, X, reset=False, dtype=[xp.float64, xp.float32])
else:
X = check_array(X)

precision = self.get_precision()
with config_context(assume_finite=True):
# compute mahalanobis distances
dist = pairwise_distances(
X, self.location_[np.newaxis, :], metric="mahalanobis", VI=precision
X, self.location_[None, :], metric="mahalanobis", VI=precision
)

return np.reshape(dist, (len(X),)) ** 2
return xp.reshape(dist, (len(X),)) ** 2

error_norm = wrap_output_data(_sklearn_EmpiricalCovariance.error_norm)
score = wrap_output_data(_sklearn_EmpiricalCovariance.score)
Expand Down
11 changes: 7 additions & 4 deletions sklearnex/spmd/covariance/covariance.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@
# limitations under the License.
# ==============================================================================

from onedal.spmd.covariance import EmpiricalCovariance
from onedal.spmd.covariance import EmpiricalCovariance as onedal_EmpiricalCovariance

# TODO:
# Currently it uses `onedal` module interface.
# Add sklearnex dispatching.
from ...preview.covariance import EmpiricalCovariance as EmpiricalCovariance_Batch


class EmpiricalCovariance(EmpiricalCovariance_Batch):
    # Reuse the batch estimator's public docstring verbatim so help() and
    # sphinx show identical documentation for the SPMD variant.
    __doc__ = EmpiricalCovariance_Batch.__doc__
    # Swap in the SPMD (multi-process/distributed) oneDAL backend; staticmethod
    # prevents the class reference from being bound as an instance method.
    # NOTE(review): currently uses the `onedal` module interface directly —
    # sklearnex dispatching is still a TODO per the comment in this diff.
    _onedal_covariance = staticmethod(onedal_EmpiricalCovariance)
45 changes: 7 additions & 38 deletions sklearnex/tests/test_memory_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,14 @@
get_dataframes_and_queues,
)
from onedal.tests.utils._device_selection import get_queues, is_dpctl_device_available
from onedal.utils._array_api import _get_sycl_namespace
from onedal.utils._dpep_helpers import dpctl_available, dpnp_available
from sklearnex import config_context
from sklearnex.tests.utils import PATCHED_FUNCTIONS, PATCHED_MODELS, SPECIAL_INSTANCES
from sklearnex.tests.utils import (
PATCHED_FUNCTIONS,
PATCHED_MODELS,
SPECIAL_INSTANCES,
DummyEstimator,
)
from sklearnex.utils._array_api import get_namespace

if dpctl_available:
Expand Down Expand Up @@ -131,41 +135,6 @@ def gen_functions(functions):
ORDER_DICT = {"F": np.asfortranarray, "C": np.ascontiguousarray}


if _is_dpc_backend:

from sklearn.utils.validation import check_is_fitted

from onedal.datatypes import from_table, to_table

class DummyEstimatorWithTableConversions(BaseEstimator):

def fit(self, X, y=None):
sua_iface, xp, _ = _get_sycl_namespace(X)
X_table = to_table(X)
y_table = to_table(y)
# The presence of the fitted attributes (ending with a trailing
# underscore) is required for the correct check. The cleanup of
# the memory will occur at the estimator instance deletion.
self.x_attr_ = from_table(
X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp
)
self.y_attr_ = from_table(
y_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp
)
return self

def predict(self, X):
# Checks if the estimator is fitted by verifying the presence of
# fitted attributes (ending with a trailing underscore).
check_is_fitted(self)
sua_iface, xp, _ = _get_sycl_namespace(X)
X_table = to_table(X)
returned_X = from_table(
X_table, sua_iface=sua_iface, sycl_queue=X.sycl_queue, xp=xp
)
return returned_X


def gen_clsf_data(n_samples, n_features, dtype=None):
data, label = make_classification(
n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777
Expand Down Expand Up @@ -369,7 +338,7 @@ def test_table_conversions_memory_leaks(dataframe, queue, order, data_shape, dty
pytest.skip("SYCL device memory leak check requires the level zero sysman")

_kfold_function_template(
DummyEstimatorWithTableConversions,
DummyEstimator,
dataframe,
data_shape,
queue,
Expand Down
Loading