[WIP] ENH: Resample additional arrays apart from X and y #463

Open · wants to merge 16 commits into base: master
2 changes: 1 addition & 1 deletion examples/over-sampling/plot_comparison_over_sampling.py
@@ -134,7 +134,7 @@ class FakeSampler(BaseSampler):

     _sampling_type = 'bypass'

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         return X, y
42 changes: 35 additions & 7 deletions imblearn/base.py
@@ -16,6 +16,8 @@
 from sklearn.externals import six
 from sklearn.preprocessing import label_binarize
 from sklearn.utils import check_X_y
+from sklearn.utils import check_consistent_length
+from sklearn.utils import check_array

 from .utils import check_sampling_strategy, check_target_type
 from .utils.deprecation import deprecate_parameter
@@ -55,7 +57,7 @@ def fit(self, X, y):
             self.sampling_strategy, y, self._sampling_type)
         return self

-    def fit_resample(self, X, y):
+    def fit_resample(self, X, y, sample_weight=None):
         """Resample the dataset.

         Parameters
@@ -66,24 +68,39 @@ def fit_resample(self, X, y):
         y : array-like, shape (n_samples,)
             Corresponding label for each sample in X.

+        sample_weight : array-like, shape (n_samples,) or None
+            Sample weights.
+
         Returns
         -------
-        X_resampled : {array-like, sparse matrix}, shape \
+        X_resampled : {ndarray, sparse matrix}, shape \
             (n_samples_new, n_features)
             The array containing the resampled data.

-        y_resampled : array-like, shape (n_samples_new,)
+        y_resampled : ndarray, shape (n_samples_new,)
             The corresponding label of `X_resampled`.

+        sample_weight_resampled : ndarray, shape (n_samples_new,)
+            Resampled sample weights. This output is returned only if
+            ``sample_weight`` was not ``None``.

[Member] I would rather have a dict of non-X,y returned. (Optionally? In scikit-learn I would rather this be mandatory so we don't need to handle both cases.)
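As an illustration of the suggestion above, a toy sampler that always returns a props dict might look like this (a sketch only; the name `props` and its structure are hypothetical, not part of this PR):

import numpy as np

class DictReturnSampler:
    """Toy sampler sketching the dict-based return suggested above.
    fit_resample always returns (X, y, props); props carries every
    resampled non-X,y array, so callers never branch on tuple length."""

    def fit_resample(self, X, y, sample_weight=None):
        # Toy "resampling": repeat the first sample once.
        idx = np.concatenate([np.arange(len(y)), [0]])
        props = {'indices': idx}                 # always a dict
        if sample_weight is not None:
            props['sample_weight'] = sample_weight[idx]
        return X[idx], y[idx], props

X = np.array([[0.], [1.], [2.]])
y = np.array([0, 1, 1])
X_res, y_res, props = DictReturnSampler().fit_resample(
    X, y, sample_weight=np.array([0.5, 1.0, 1.0]))
print(sorted(props))    # ['indices', 'sample_weight']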

+        idx_resampled : ndarray, shape (n_samples_new,)
+            Indices of the selected features. This output is optional and only
+            available for some samplers if ``return_indices=True``.

[Member] Could you explain why this should be returned from fit_resample, rather than stored as an attribute?

[Member, author] I think that it was some original design (before it was in scikit-learn). But actually it would be better to keep it as an attribute with the single fit_resample.

[Member] Do you mean the selected samples?

[Member, author] Yes.
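A sketch of the attribute-based alternative discussed above, where the indices are stored on the fitted sampler instead of being returned (the attribute name `sample_indices_` is an assumption here, following scikit-learn's trailing-underscore convention for fitted attributes):

import numpy as np

class AttributeIndicesSampler:
    """Toy sampler that records which samples were kept as a fitted
    attribute rather than as an extra return value."""

    def fit_resample(self, X, y):
        idx = np.arange(len(y))        # toy: keep every sample
        self.sample_indices_ = idx     # stored, not returned
        return X[idx], y[idx]

sampler = AttributeIndicesSampler()
X_res, y_res = sampler.fit_resample(np.eye(3), np.array([0, 1, 1]))
print(sampler.sample_indices_)         # [0 1 2]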

"""
self._deprecate_ratio()

X, y, binarize_y = self._check_X_y(X, y)
if sample_weight is not None:
sample_weight = check_array(sample_weight, ensure_2d=False)
check_consistent_length(X, y, sample_weight)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type)

output = self._fit_resample(X, y)
output = self._fit_resample(X, y, sample_weight)

if binarize_y:
y_sampled = label_binarize(output[1], np.unique(y))
@@ -96,7 +113,7 @@ def fit_resample(self, X, y):
     fit_sample = fit_resample

     @abstractmethod
-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         """Base method defined in each sampler to define the sampling
         strategy.

@@ -108,14 +125,25 @@ def _fit_resample(self, X, y):
         y : array-like, shape (n_samples,)
             Corresponding label for each sample in X.

+        sample_weight : array-like, shape (n_samples,) or None
+            Sample weights.
+
         Returns
         -------
         X_resampled : {ndarray, sparse matrix}, shape \
             (n_samples_new, n_features)
             The array containing the resampled data.

         y_resampled : ndarray, shape (n_samples_new,)
-            The corresponding label of `X_resampled`
+            The corresponding label of `X_resampled`.
+
+        sample_weight_resampled : ndarray, shape (n_samples_new,)
+            Resampled sample weights. This output is returned only if
+            ``sample_weight`` was not ``None``.
+
+        idx_resampled : ndarray, shape (n_samples_new,)
+            Indices of the selected features. This output is optional and only
+            available for some samplers if ``return_indices=True``.

         """
         pass
@@ -243,7 +271,7 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None):
         self.kw_args = kw_args
         self.logger = logging.getLogger(__name__)

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']
                          if self.accept_sparse else False)
         func = _identity if self.func is None else self.func
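Taken together, the base-class changes give calls of roughly this shape (a sketch against this branch, not a released imbalanced-learn API):

import numpy as np
from imblearn.over_sampling import RandomOverSampler

X = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])
y = np.array([0, 0, 0, 1])            # 3:1 imbalance
w = np.array([1.0, 1.0, 1.0, 2.0])    # one weight per sample

ros = RandomOverSampler(random_state=0)
# Without weights: the familiar two outputs.
X_res, y_res = ros.fit_resample(X, y)
# With weights: a third, equally resampled array comes back.
X_res, y_res, w_res = ros.fit_resample(X, y, sample_weight=w)
assert len(X_res) == len(y_res) == len(w_res)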
6 changes: 3 additions & 3 deletions imblearn/combine/_smote_enn.py
@@ -125,11 +125,11 @@ def _validate_estimator(self):
         else:
             self.enn_ = EditedNearestNeighbours(sampling_strategy='all')

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()
         y = check_target_type(y)
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         self.sampling_strategy_ = self.sampling_strategy

-        X_res, y_res = self.smote_.fit_resample(X, y)
-        return self.enn_.fit_resample(X_res, y_res)
+        resampled_arrays = self.smote_.fit_resample(X, y, sample_weight)
+        return self.enn_.fit_resample(*resampled_arrays)
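The two new lines lean on the variable-length return: whatever tuple SMOTE produces (with or without weights) is forwarded wholesale to ENN through argument unpacking. Schematically, with toy functions standing in for the two samplers:

def smote_like(X, y, sample_weight=None):
    # Stand-in: returns a 2- or 3-tuple, like the samplers in this PR.
    return (X, y) if sample_weight is None else (X, y, sample_weight)

def enn_like(X, y, sample_weight=None):
    return (X, y) if sample_weight is None else (X, y, sample_weight)

resampled_arrays = smote_like([[0.]], [0], sample_weight=[1.0])
result = enn_like(*resampled_arrays)   # tuple length adapts automatically
print(len(result))                     # 3; would be 2 without weights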
6 changes: 3 additions & 3 deletions imblearn/combine/_smote_tomek.py
@@ -134,11 +134,11 @@ def _validate_estimator(self):
         else:
             self.tomek_ = TomekLinks(sampling_strategy='all')

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()
         y = check_target_type(y)
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         self.sampling_strategy_ = self.sampling_strategy

-        X_res, y_res = self.smote_.fit_resample(X, y)
-        return self.tomek_.fit_resample(X_res, y_res)
+        resampled_arrays = self.smote_.fit_resample(X, y, sample_weight)
+        return self.tomek_.fit_resample(*resampled_arrays)
17 changes: 9 additions & 8 deletions imblearn/ensemble/_bagging.py
@@ -210,7 +210,7 @@ def __init__(self,
         self.ratio = ratio
         self.replacement = replacement

-    def _validate_estimator(self, default=DecisionTreeClassifier()):
+    def _validate_estimator(self):
         """Check the estimator and the n_estimator attribute, set the
         `base_estimator_` attribute."""
         if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
@@ -224,12 +224,14 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):
         if self.base_estimator is not None:
             base_estimator = clone(self.base_estimator)
         else:
-            base_estimator = clone(default)
+            base_estimator = clone(DecisionTreeClassifier())

-        self.base_estimator_ = Pipeline([('sampler', RandomUnderSampler(
-            sampling_strategy=self.sampling_strategy,
-            replacement=self.replacement,
-            ratio=self.ratio)), ('classifier', base_estimator)])
+        self.base_estimator_ = Pipeline([
+            ('sampler', RandomUnderSampler(
+                sampling_strategy=self.sampling_strategy,
+                replacement=self.replacement,
+                ratio=self.ratio)),
+            ('classifier', base_estimator)])

     def fit(self, X, y):
         """Build a Bagging ensemble of estimators from the training
@@ -248,6 +250,5 @@ def fit(self, X, y):
         self : object
             Returns self.
         """
-        # RandomUnderSampler is not supporting sample_weight. We need to pass
-        # None.
+        # Pipeline does not support sample_weight
         return self._fit(X, y, self.max_samples, sample_weight=None)
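The replacement comment is about routing: a scikit-learn Pipeline of this era does not accept a bare sample_weight keyword in fit; fit parameters must be addressed to a step with the step__param syntax, which is why the bagging code above passes sample_weight=None. A minimal illustration (not code from this PR):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

X = np.array([[0.], [1.], [2.], [3.]])
y = np.array([0, 0, 1, 1])
w = np.array([1.0, 1.0, 2.0, 2.0])

pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', DecisionTreeClassifier())])
# pipe.fit(X, y, sample_weight=w)     # fails: no such routed parameter
pipe.fit(X, y, clf__sample_weight=w)  # reaches the final estimator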
2 changes: 1 addition & 1 deletion imblearn/ensemble/_balance_cascade.py
@@ -128,7 +128,7 @@ def _validate_estimator(self):

         self.logger.debug(self.estimator_)

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()

         self.sampling_strategy_ = check_sampling_strategy(
2 changes: 1 addition & 1 deletion imblearn/ensemble/_easy_ensemble.py
@@ -114,7 +114,7 @@ def __init__(self,
         self.replacement = replacement
         self.n_subsets = n_subsets

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         random_state = check_random_state(self.random_state)

         X_resampled = []
17 changes: 12 additions & 5 deletions imblearn/over_sampling/_adasyn.py
@@ -106,12 +106,14 @@ def _validate_estimator(self):
             'n_neighbors', self.n_neighbors, additional_neighbor=1)
         self.nn_.set_params(**{'n_jobs': self.n_jobs})

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()
         random_state = check_random_state(self.random_state)

         X_resampled = X.copy()
         y_resampled = y.copy()
+        if sample_weight is not None:
+            sample_weight_resampled = sample_weight.copy()

         for class_sample, n_samples in self.sampling_strategy_.items():
             if n_samples == 0:
@@ -165,8 +167,6 @@ def _fit_resample(self, X, y):
                 X_new = (sparse.csr_matrix(
                     (samples, (row_indices, col_indices)),
                     [np.sum(n_samples_generate), X.shape[1]], dtype=X.dtype))
-                y_new = np.array([class_sample] * np.sum(n_samples_generate),
-                                 dtype=y.dtype)
             else:
                 x_class_gen = []
                 for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index,
@@ -182,13 +182,20 @@ def _fit_resample(self, X, y):
                 ])

                 X_new = np.concatenate(x_class_gen).astype(X.dtype)
-                y_new = np.array([class_sample] * np.sum(n_samples_generate),
-                                 dtype=y.dtype)

+            y_new = np.array([class_sample] * np.sum(n_samples_generate),
+                             dtype=y.dtype)
+            if sample_weight is not None:
+                sample_weight_resampled = np.hstack(
+                    (sample_weight_resampled,
+                     np.ones_like(y_new, dtype=sample_weight.dtype)))
+
             if sparse.issparse(X_new):
                 X_resampled = sparse.vstack([X_resampled, X_new])
             else:
                 X_resampled = np.vstack((X_resampled, X_new))
             y_resampled = np.hstack((y_resampled, y_new))

+        if sample_weight is not None:
+            return X_resampled, y_resampled, sample_weight_resampled
         return X_resampled, y_resampled
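Note the design choice encoded in the np.hstack above: every synthetic sample ADASYN generates receives a weight of 1.0, whatever the weights of its seed neighbours were. In miniature:

import numpy as np

sample_weight = np.array([0.5, 2.0, 1.0])  # weights of the original samples
y_new = np.array([1, 1])                   # two synthetic minority samples
sample_weight_resampled = np.hstack(
    (sample_weight, np.ones_like(y_new, dtype=sample_weight.dtype)))
print(sample_weight_resampled)             # [0.5 2.  1.  1.  1. ]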
13 changes: 7 additions & 6 deletions imblearn/over_sampling/_random_over_sampler.py
@@ -88,7 +88,7 @@ def _check_X_y(X, y):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
         return X, y, binarize_y

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         random_state = check_random_state(self.random_state)
         target_stats = Counter(y)

@@ -102,9 +102,10 @@ def _fit_resample(self, X, y):
             sample_indices = np.append(sample_indices,
                                        target_class_indices[indices])

+        resampled_arrays = [safe_indexing(arr, sample_indices)
+                            for arr in (X, y, sample_weight)
+                            if arr is not None]

         if self.return_indices:
-            return (safe_indexing(X, sample_indices), safe_indexing(
-                y, sample_indices), sample_indices)
-        else:
-            return (safe_indexing(X, sample_indices), safe_indexing(
-                y, sample_indices))
+            return tuple(resampled_arrays + [sample_indices])
+        return tuple(resampled_arrays)
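Depending on sample_weight and return_indices, the rewritten return path yields a 2-, 3-, or 4-tuple. The same filtering pattern in miniature (safe_indexing comes from the sklearn of this PR's era; it was removed from the public API in later releases):

import numpy as np
from sklearn.utils import safe_indexing  # sklearn < 0.24

X = np.array([[0.], [1.], [2.]])
y = np.array([0, 1, 1])
sample_weight = None                      # pass an array to get a 3rd output
sample_indices = np.array([0, 0, 1, 2])   # minority sample 0 drawn twice

resampled_arrays = [safe_indexing(arr, sample_indices)
                    for arr in (X, y, sample_weight)
                    if arr is not None]
print(len(resampled_arrays))              # 2 here; 3 with weights given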