[WIP] ENH: Resample additional arrays apart from X and y #463

Open · wants to merge 16 commits into base: master
2 changes: 1 addition & 1 deletion examples/over-sampling/plot_comparison_over_sampling.py
@@ -134,7 +134,7 @@ class FakeSampler(BaseSampler):

     _sampling_type = 'bypass'

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         return X, y
42 changes: 35 additions & 7 deletions imblearn/base.py
@@ -16,6 +16,8 @@
 from sklearn.externals import six
 from sklearn.preprocessing import label_binarize
 from sklearn.utils import check_X_y
+from sklearn.utils import check_consistent_length
+from sklearn.utils import check_array

 from .utils import check_sampling_strategy, check_target_type
 from .utils.deprecation import deprecate_parameter
@@ -55,7 +57,7 @@ def fit(self, X, y):
             self.sampling_strategy, y, self._sampling_type)
         return self

-    def fit_resample(self, X, y):
+    def fit_resample(self, X, y, sample_weight=None):
         """Resample the dataset.

         Parameters
@@ -66,24 +68,39 @@ def fit_resample(self, X, y):
         y : array-like, shape (n_samples,)
             Corresponding label for each sample in X.

+        sample_weight : array-like, shape (n_samples,) or None
+            Sample weights.
+
         Returns
         -------
-        X_resampled : {array-like, sparse matrix}, shape \
+        X_resampled : {ndarray, sparse matrix}, shape \
             (n_samples_new, n_features)
             The array containing the resampled data.

-        y_resampled : array-like, shape (n_samples_new,)
+        y_resampled : ndarray, shape (n_samples_new,)
             The corresponding label of `X_resampled`.

+        sample_weight_resampled : ndarray, shape (n_samples_new,)
+            Resampled sample weights. This output is returned only if
+            ``sample_weight`` was not ``None``.

[Member] I would rather have a dict of non-X,y returned. (Optionally? In scikit-learn I would rather this be mandatory so we don't need to handle both cases.)
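As an illustration of the suggestion above, a toy sampler that always returns a props dict might look like this (a sketch only; the name `props` and its structure are hypothetical, not part of this PR):

import numpy as np

class DictReturnSampler:
    """Toy sampler sketching the dict-based return suggested above.
    fit_resample always returns (X, y, props); props carries every
    resampled non-X,y array, so callers never branch on tuple length."""

    def fit_resample(self, X, y, sample_weight=None):
        # Toy "resampling": repeat the first sample once.
        idx = np.concatenate([np.arange(len(y)), [0]])
        props = {'indices': idx}                 # always a dict
        if sample_weight is not None:
            props['sample_weight'] = sample_weight[idx]
        return X[idx], y[idx], props

X = np.array([[0.], [1.], [2.]])
y = np.array([0, 1, 1])
X_res, y_res, props = DictReturnSampler().fit_resample(
    X, y, sample_weight=np.array([0.5, 1.0, 1.0]))
print(sorted(props))    # ['indices', 'sample_weight']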

+        idx_resampled : ndarray, shape (n_samples_new,)
+            Indices of the selected features. This output is optional and only
+            available for some samplers if ``return_indices=True``.

[Member] Could you explain why this should be returned from fit_resample, rather than stored as an attribute?

[Member, author] I think that it was some original design (before it was in scikit-learn). But actually it would be better to keep it as an attribute with the single fit_resample.

[Member] Do you mean the selected samples?

[Member, author] Yes.
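A sketch of the attribute-based alternative discussed above, where the indices are stored on the fitted sampler instead of being returned (the attribute name `sample_indices_` is an assumption here, following scikit-learn's trailing-underscore convention for fitted attributes):

import numpy as np

class AttributeIndicesSampler:
    """Toy sampler that records which samples were kept as a fitted
    attribute rather than as an extra return value."""

    def fit_resample(self, X, y):
        idx = np.arange(len(y))        # toy: keep every sample
        self.sample_indices_ = idx     # stored, not returned
        return X[idx], y[idx]

sampler = AttributeIndicesSampler()
X_res, y_res = sampler.fit_resample(np.eye(3), np.array([0, 1, 1]))
print(sampler.sample_indices_)         # [0 1 2]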

"""
self._deprecate_ratio()

X, y, binarize_y = self._check_X_y(X, y)
if sample_weight is not None:
sample_weight = check_array(sample_weight, ensure_2d=False)
check_consistent_length(X, y, sample_weight)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type)

output = self._fit_resample(X, y)
output = self._fit_resample(X, y, sample_weight)

if binarize_y:
y_sampled = label_binarize(output[1], np.unique(y))
@@ -96,7 +113,7 @@ def fit_resample(self, X, y):
     fit_sample = fit_resample

     @abstractmethod
-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         """Base method defined in each sampler to define the sampling
         strategy.

@@ -108,14 +125,25 @@ def _fit_resample(self, X, y):
         y : array-like, shape (n_samples,)
             Corresponding label for each sample in X.

+        sample_weight : array-like, shape (n_samples,) or None
+            Sample weights.
+
         Returns
         -------
         X_resampled : {ndarray, sparse matrix}, shape \
             (n_samples_new, n_features)
             The array containing the resampled data.

         y_resampled : ndarray, shape (n_samples_new,)
-            The corresponding label of `X_resampled`
+            The corresponding label of `X_resampled`.
+
+        sample_weight_resampled : ndarray, shape (n_samples_new,)
+            Resampled sample weights. This output is returned only if
+            ``sample_weight`` was not ``None``.
+
+        idx_resampled : ndarray, shape (n_samples_new,)
+            Indices of the selected features. This output is optional and only
+            available for some samplers if ``return_indices=True``.

         """
         pass
@@ -243,7 +271,7 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None):
         self.kw_args = kw_args
         self.logger = logging.getLogger(__name__)

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']
                          if self.accept_sparse else False)
         func = _identity if self.func is None else self.func
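Taken together, the base-class changes give calls of roughly this shape (a sketch against this branch, not a released imbalanced-learn API):

import numpy as np
from imblearn.over_sampling import RandomOverSampler

X = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])
y = np.array([0, 0, 0, 1])            # 3:1 imbalance
w = np.array([1.0, 1.0, 1.0, 2.0])    # one weight per sample

ros = RandomOverSampler(random_state=0)
# Without weights: the familiar two outputs.
X_res, y_res = ros.fit_resample(X, y)
# With weights: a third, equally resampled array comes back.
X_res, y_res, w_res = ros.fit_resample(X, y, sample_weight=w)
assert len(X_res) == len(y_res) == len(w_res)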
6 changes: 3 additions & 3 deletions imblearn/combine/_smote_enn.py
@@ -125,11 +125,11 @@ def _validate_estimator(self):
         else:
             self.enn_ = EditedNearestNeighbours(sampling_strategy='all')

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()
         y = check_target_type(y)
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         self.sampling_strategy_ = self.sampling_strategy

-        X_res, y_res = self.smote_.fit_resample(X, y)
-        return self.enn_.fit_resample(X_res, y_res)
+        resampled_arrays = self.smote_.fit_resample(X, y, sample_weight)
+        return self.enn_.fit_resample(*resampled_arrays)
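The two new lines lean on the variable-length return: whatever tuple SMOTE produces (with or without weights) is forwarded wholesale to ENN through argument unpacking. Schematically, with toy functions standing in for the two samplers:

def smote_like(X, y, sample_weight=None):
    # Stand-in: returns a 2- or 3-tuple, like the samplers in this PR.
    return (X, y) if sample_weight is None else (X, y, sample_weight)

def enn_like(X, y, sample_weight=None):
    return (X, y) if sample_weight is None else (X, y, sample_weight)

resampled_arrays = smote_like([[0.]], [0], sample_weight=[1.0])
result = enn_like(*resampled_arrays)   # tuple length adapts automatically
print(len(result))                     # 3; would be 2 without weights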
6 changes: 3 additions & 3 deletions imblearn/combine/_smote_tomek.py
@@ -134,11 +134,11 @@ def _validate_estimator(self):
         else:
             self.tomek_ = TomekLinks(sampling_strategy='all')

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()
         y = check_target_type(y)
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         self.sampling_strategy_ = self.sampling_strategy

-        X_res, y_res = self.smote_.fit_resample(X, y)
-        return self.tomek_.fit_resample(X_res, y_res)
+        resampled_arrays = self.smote_.fit_resample(X, y, sample_weight)
+        return self.tomek_.fit_resample(*resampled_arrays)
17 changes: 9 additions & 8 deletions imblearn/ensemble/_bagging.py
@@ -210,7 +210,7 @@ def __init__(self,
         self.ratio = ratio
         self.replacement = replacement

-    def _validate_estimator(self, default=DecisionTreeClassifier()):
+    def _validate_estimator(self):
         """Check the estimator and the n_estimator attribute, set the
         `base_estimator_` attribute."""
         if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
@@ -224,12 +224,14 @@ def _validate_estimator(self, default=DecisionTreeClassifier()):
         if self.base_estimator is not None:
             base_estimator = clone(self.base_estimator)
         else:
-            base_estimator = clone(default)
+            base_estimator = clone(DecisionTreeClassifier())

-        self.base_estimator_ = Pipeline([('sampler', RandomUnderSampler(
-            sampling_strategy=self.sampling_strategy,
-            replacement=self.replacement,
-            ratio=self.ratio)), ('classifier', base_estimator)])
+        self.base_estimator_ = Pipeline([
+            ('sampler', RandomUnderSampler(
+                sampling_strategy=self.sampling_strategy,
+                replacement=self.replacement,
+                ratio=self.ratio)),
+            ('classifier', base_estimator)])

     def fit(self, X, y):
         """Build a Bagging ensemble of estimators from the training
@@ -248,6 +250,5 @@ def fit(self, X, y):
         self : object
             Returns self.
         """
-        # RandomUnderSampler is not supporting sample_weight. We need to pass
-        # None.
+        # Pipeline does not support sample_weight
         return self._fit(X, y, self.max_samples, sample_weight=None)
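The replacement comment is about routing: a scikit-learn Pipeline of this era does not accept a bare sample_weight keyword in fit; fit parameters must be addressed to a step with the step__param syntax, which is why the bagging code above passes sample_weight=None. A minimal illustration (not code from this PR):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

X = np.array([[0.], [1.], [2.], [3.]])
y = np.array([0, 0, 1, 1])
w = np.array([1.0, 1.0, 2.0, 2.0])

pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', DecisionTreeClassifier())])
# pipe.fit(X, y, sample_weight=w)     # fails: no such routed parameter
pipe.fit(X, y, clf__sample_weight=w)  # reaches the final estimator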
2 changes: 1 addition & 1 deletion imblearn/ensemble/_balance_cascade.py
@@ -128,7 +128,7 @@ def _validate_estimator(self):

         self.logger.debug(self.estimator_)

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()

         self.sampling_strategy_ = check_sampling_strategy(
2 changes: 1 addition & 1 deletion imblearn/ensemble/_easy_ensemble.py
@@ -114,7 +114,7 @@ def __init__(self,
         self.replacement = replacement
         self.n_subsets = n_subsets

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         random_state = check_random_state(self.random_state)

         X_resampled = []
17 changes: 12 additions & 5 deletions imblearn/over_sampling/_adasyn.py
@@ -106,12 +106,14 @@ def _validate_estimator(self):
             'n_neighbors', self.n_neighbors, additional_neighbor=1)
         self.nn_.set_params(**{'n_jobs': self.n_jobs})

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         self._validate_estimator()
         random_state = check_random_state(self.random_state)

         X_resampled = X.copy()
         y_resampled = y.copy()
+        if sample_weight is not None:
+            sample_weight_resampled = sample_weight.copy()

         for class_sample, n_samples in self.sampling_strategy_.items():
             if n_samples == 0:
@@ -165,8 +167,6 @@ def _fit_resample(self, X, y):
                 X_new = (sparse.csr_matrix(
                     (samples, (row_indices, col_indices)),
                     [np.sum(n_samples_generate), X.shape[1]], dtype=X.dtype))
-                y_new = np.array([class_sample] * np.sum(n_samples_generate),
-                                 dtype=y.dtype)
             else:
                 x_class_gen = []
                 for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index,
@@ -182,13 +182,20 @@ def _fit_resample(self, X, y):
                 ])

                 X_new = np.concatenate(x_class_gen).astype(X.dtype)
-                y_new = np.array([class_sample] * np.sum(n_samples_generate),
-                                 dtype=y.dtype)

+            y_new = np.array([class_sample] * np.sum(n_samples_generate),
+                             dtype=y.dtype)
+            if sample_weight is not None:
+                sample_weight_resampled = np.hstack(
+                    (sample_weight_resampled,
+                     np.ones_like(y_new, dtype=sample_weight.dtype)))
+
             if sparse.issparse(X_new):
                 X_resampled = sparse.vstack([X_resampled, X_new])
             else:
                 X_resampled = np.vstack((X_resampled, X_new))
             y_resampled = np.hstack((y_resampled, y_new))

+        if sample_weight is not None:
+            return X_resampled, y_resampled, sample_weight_resampled
         return X_resampled, y_resampled
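Note the design choice encoded in the np.hstack above: every synthetic sample ADASYN generates receives a weight of 1.0, whatever the weights of its seed neighbours were. In miniature:

import numpy as np

sample_weight = np.array([0.5, 2.0, 1.0])  # weights of the original samples
y_new = np.array([1, 1])                   # two synthetic minority samples
sample_weight_resampled = np.hstack(
    (sample_weight, np.ones_like(y_new, dtype=sample_weight.dtype)))
print(sample_weight_resampled)             # [0.5 2.  1.  1.  1. ]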
13 changes: 7 additions & 6 deletions imblearn/over_sampling/_random_over_sampler.py
@@ -88,7 +88,7 @@ def _check_X_y(X, y):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
         return X, y, binarize_y

-    def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y, sample_weight=None):
         random_state = check_random_state(self.random_state)
         target_stats = Counter(y)

@@ -102,9 +102,10 @@ def _fit_resample(self, X, y):
             sample_indices = np.append(sample_indices,
                                        target_class_indices[indices])

+        resampled_arrays = [safe_indexing(arr, sample_indices)
+                            for arr in (X, y, sample_weight)
+                            if arr is not None]

         if self.return_indices:
-            return (safe_indexing(X, sample_indices), safe_indexing(
-                y, sample_indices), sample_indices)
-        else:
-            return (safe_indexing(X, sample_indices), safe_indexing(
-                y, sample_indices))
+            return tuple(resampled_arrays + [sample_indices])
+        return tuple(resampled_arrays)
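Depending on sample_weight and return_indices, the rewritten return path yields a 2-, 3-, or 4-tuple. The same filtering pattern in miniature (safe_indexing comes from the sklearn of this PR's era; it was removed from the public API in later releases):

import numpy as np
from sklearn.utils import safe_indexing  # sklearn < 0.24

X = np.array([[0.], [1.], [2.]])
y = np.array([0, 1, 1])
sample_weight = None                      # pass an array to get a 3rd output
sample_indices = np.array([0, 0, 1, 2])   # minority sample 0 drawn twice

resampled_arrays = [safe_indexing(arr, sample_indices)
                    for arr in (X, y, sample_weight)
                    if arr is not None]
print(len(resampled_arrays))              # 2 here; 3 with weights given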