ENH add GeometricSMOTE implementation (scikit-learn-contrib#881)

joaopfonseca · Dec 17, 2021 · 7e34701 · 7e34701
1 parent f1abf75
commit 7e34701
Show file tree

Hide file tree

Showing 4 changed files with 538 additions and 0 deletions.
diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py
@@ -8,6 +8,7 @@
 from ._smote import SMOTE
 from ._smote import BorderlineSMOTE
 from ._smote import KMeansSMOTE
+from ._smote import GeometricSMOTE
 from ._smote import SVMSMOTE
 from ._smote import SMOTENC
 from ._smote import SMOTEN
@@ -16,6 +17,7 @@
     "ADASYN",
     "RandomOverSampler",
     "KMeansSMOTE",
+    "GeometricSMOTE",
     "SMOTE",
     "BorderlineSMOTE",
     "SVMSMOTE",

diff --git a/imblearn/over_sampling/_smote/__init__.py b/imblearn/over_sampling/_smote/__init__.py
@@ -4,6 +4,8 @@
 
 from .cluster import KMeansSMOTE
 
+from .geometric import GeometricSMOTE
+
 from .filter import BorderlineSMOTE
 from .filter import SVMSMOTE
 
@@ -12,6 +14,7 @@
     "SMOTEN",
     "SMOTENC",
     "KMeansSMOTE",
+    "GeometricSMOTE",
     "BorderlineSMOTE",
     "SVMSMOTE",
 ]
diff --git a/imblearn/over_sampling/_smote/geometric.py b/imblearn/over_sampling/_smote/geometric.py
@@ -0,0 +1,324 @@
+"""Class to perform over-sampling using Geometric SMOTE."""
+
+# Author: Georgios Douzas <[email protected]>
+# License: BSD 3 clause
+
+import numpy as np
+from numpy.linalg import norm
+from sklearn.utils import check_random_state
+from imblearn.over_sampling.base import BaseOverSampler
+from imblearn.utils import check_neighbors_object, Substitution
+from imblearn.utils._docstring import _random_state_docstring
+
+SELECTION_STRATEGY = ('combined', 'majority', 'minority')
+
+
+def _make_geometric_sample(
+    center, surface_point, truncation_factor, deformation_factor, random_state
+):
+    """A support function that returns an artificial point inside
+    the geometric region defined by the center and surface points.
+
+    Parameters
+    ----------
+    center : ndarray, shape (n_features, )
+        Center point of the geometric region.
+
+    surface_point : ndarray, shape (n_features, )
+        Surface point of the geometric region.
+
+    truncation_factor : float, optional (default=0.0)
+        The type of truncation. The values should be in the [-1.0, 1.0] range.
+
+    deformation_factor : float, optional (default=0.0)
+        The type of geometry. The values should be in the [0.0, 1.0] range.
+
+    random_state : int, RandomState instance or None
+        Control the randomization of the algorithm.
+
+    Returns
+    -------
+    point : ndarray, shape (n_features, )
+            Synthetically generated sample.
+
+    """
+
+    # Zero radius case
+    if np.array_equal(center, surface_point):
+        return center
+
+    # Generate a point on the surface of a unit hyper-sphere
+    radius = norm(center - surface_point)
+    normal_samples = random_state.normal(size=center.size)
+    point_on_unit_sphere = normal_samples / norm(normal_samples)
+    point = (random_state.uniform(size=1) ** (1 / center.size)) * point_on_unit_sphere
+
+    # Parallel unit vector
+    parallel_unit_vector = (surface_point - center) / norm(surface_point - center)
+
+    # Truncation
+    close_to_opposite_boundary = (
+        truncation_factor > 0
+        and np.dot(point, parallel_unit_vector) < truncation_factor - 1
+    )
+    close_to_boundary = (
+        truncation_factor < 0
+        and np.dot(point, parallel_unit_vector) > truncation_factor + 1
+    )
+    if close_to_opposite_boundary or close_to_boundary:
+        point -= 2 * np.dot(point, parallel_unit_vector) * parallel_unit_vector
+
+    # Deformation
+    parallel_point_position = np.dot(point, parallel_unit_vector) * parallel_unit_vector
+    perpendicular_point_position = point - parallel_point_position
+    point = (
+        parallel_point_position
+        + (1 - deformation_factor) * perpendicular_point_position
+    )
+
+    # Translation
+    point = center + radius * point
+
+    return point
+
+
+@Substitution(
+    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
+    random_state=_random_state_docstring,
+)
+class GeometricSMOTE(BaseOverSampler):
+    """Class to to perform over-sampling using Geometric SMOTE.
+
+    This algorithm is an implementation of Geometric SMOTE, a geometrically
+    enhanced drop-in replacement for SMOTE as presented in [1]_.
+
+    Read more in the :ref:`User Guide <user_guide>`.
+
+    Parameters
+    ----------
+    {sampling_strategy}
+
+    {random_state}
+
+    truncation_factor : float, optional (default=0.0)
+        The type of truncation. The values should be in the [-1.0, 1.0] range.
+
+    deformation_factor : float, optional (default=0.0)
+        The type of geometry. The values should be in the [0.0, 1.0] range.
+
+    selection_strategy : str, optional (default='combined')
+        The type of Geometric SMOTE algorithm with the following options:
+        ``'combined'``, ``'majority'``, ``'minority'``.
+
+    k_neighbors : int or object, optional (default=5)
+        If ``int``, number of nearest neighbours to use when synthetic
+        samples are constructed for the minority method.  If object, an estimator
+        that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that
+        will be used to find the k_neighbors.
+
+    n_jobs : int, optional (default=1)
+        The number of threads to open if possible.
+
+    Notes
+    -----
+    See the original paper: [1]_ for more details.
+
+    Supports multi-class resampling. A one-vs.-rest scheme is used as
+    originally proposed in [2]_.
+
+    References
+    ----------
+
+    .. [1] G. Douzas, F. Bacao, "Geometric SMOTE:
+       a geometrically enhanced drop-in replacement for SMOTE",
+       Information Sciences, vol. 501, pp. 118-135, 2019.
+
+    .. [2] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, "SMOTE:
+       synthetic minority over-sampling technique", Journal of Artificial
+       Intelligence Research, vol. 16, pp. 321-357, 2002.
+
+    Examples
+    --------
+
+    >>> from collections import Counter
+    >>> from sklearn.datasets import make_classification
+    >>> from gsmote import GeometricSMOTE # doctest: +NORMALIZE_WHITESPACE
+    >>> X, y = make_classification(n_classes=2, class_sep=2,
+    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
+    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
+    >>> print('Original dataset shape %s' % Counter(y))
+    Original dataset shape Counter({{1: 900, 0: 100}})
+    >>> gsmote = GeometricSMOTE(random_state=1)
+    >>> X_res, y_res = gsmote.fit_resample(X, y)
+    >>> print('Resampled dataset shape %s' % Counter(y_res))
+    Resampled dataset shape Counter({{0: 900, 1: 900}})
+
+    """
+
+    def __init__(
+        self,
+        sampling_strategy='auto',
+        random_state=None,
+        truncation_factor=1.0,
+        deformation_factor=0.0,
+        selection_strategy='combined',
+        k_neighbors=5,
+        n_jobs=1,
+    ):
+        super(GeometricSMOTE, self).__init__(sampling_strategy=sampling_strategy)
+        self.random_state = random_state
+        self.truncation_factor = truncation_factor
+        self.deformation_factor = deformation_factor
+        self.selection_strategy = selection_strategy
+        self.k_neighbors = k_neighbors
+        self.n_jobs = n_jobs
+
+    def _validate_estimator(self):
+        """Create the necessary attributes for Geometric SMOTE."""
+
+        # Check random state
+        self.random_state_ = check_random_state(self.random_state)
+
+        # Validate strategy
+        if self.selection_strategy not in SELECTION_STRATEGY:
+            error_msg = (
+                'Unknown selection_strategy for Geometric SMOTE algorithm. '
+                'Choices are {}. Got {} instead.'
+            )
+            raise ValueError(
+                error_msg.format(SELECTION_STRATEGY, self.selection_strategy)
+            )
+
+        # Create nearest neighbors object for positive class
+        if self.selection_strategy in ('minority', 'combined'):
+            self.nns_pos_ = check_neighbors_object(
+                'nns_positive', self.k_neighbors, additional_neighbor=1
+            )
+            self.nns_pos_.set_params(n_jobs=self.n_jobs)
+
+        # Create nearest neighbors object for negative class
+        if self.selection_strategy in ('majority', 'combined'):
+            self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
+            self.nn_neg_.set_params(n_jobs=self.n_jobs)
+
+    def _make_geometric_samples(self, X, y, pos_class_label, n_samples):
+        """A support function that returns an artificials samples inside
+        the geometric region defined by nearest neighbors.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+        y : array-like, shape (n_samples, )
+            Corresponding label for each sample in X.
+        pos_class_label : str or int
+            The minority class (positive class) target value.
+        n_samples : int
+            The number of samples to generate.
+
+        Returns
+        -------
+        X_new : ndarray, shape (n_samples_new, n_features)
+            Synthetically generated samples.
+        y_new : ndarray, shape (n_samples_new, )
+            Target values for synthetic samples.
+
+        """
+
+        # Return zero new samples
+        if n_samples == 0:
+            return (
+                np.array([], dtype=X.dtype).reshape(0, X.shape[1]),
+                np.array([], dtype=y.dtype),
+            )
+
+        # Select positive class samples
+        X_pos = X[y == pos_class_label]
+
+        # Force minority strategy if no negative class samples are present
+        self.selection_strategy_ = (
+            'minority' if len(X) == len(X_pos) else self.selection_strategy
+        )
+
+        # Minority or combined strategy
+        if self.selection_strategy_ in ('minority', 'combined'):
+            self.nns_pos_.fit(X_pos)
+            points_pos = self.nns_pos_.kneighbors(X_pos)[1][:, 1:]
+            samples_indices = self.random_state_.randint(
+                low=0, high=len(points_pos.flatten()), size=n_samples
+            )
+            rows = np.floor_divide(samples_indices, points_pos.shape[1])
+            cols = np.mod(samples_indices, points_pos.shape[1])
+
+        # Majority or combined strategy
+        if self.selection_strategy_ in ('majority', 'combined'):
+            X_neg = X[y != pos_class_label]
+            self.nn_neg_.fit(X_neg)
+            points_neg = self.nn_neg_.kneighbors(X_pos)[1]
+            if self.selection_strategy_ == 'majority':
+                samples_indices = self.random_state_.randint(
+                    low=0, high=len(points_neg.flatten()), size=n_samples
+                )
+                rows = np.floor_divide(samples_indices, points_neg.shape[1])
+                cols = np.mod(samples_indices, points_neg.shape[1])
+
+        # Generate new samples
+        X_new = np.zeros((n_samples, X.shape[1]))
+        for ind, (row, col) in enumerate(zip(rows, cols)):
+
+            # Define center point
+            center = X_pos[row]
+
+            # Minority strategy
+            if self.selection_strategy_ == 'minority':
+                surface_point = X_pos[points_pos[row, col]]
+
+            # Majority strategy
+            elif self.selection_strategy_ == 'majority':
+                surface_point = X_neg[points_neg[row, col]]
+
+            # Combined strategy
+            else:
+                surface_point_pos = X_pos[points_pos[row, col]]
+                surface_point_neg = X_neg[points_neg[row, 0]]
+                radius_pos = norm(center - surface_point_pos)
+                radius_neg = norm(center - surface_point_neg)
+                surface_point = (
+                    surface_point_neg if radius_pos > radius_neg else surface_point_pos
+                )
+
+            # Append new sample
+            X_new[ind] = _make_geometric_sample(
+                center,
+                surface_point,
+                self.truncation_factor,
+                self.deformation_factor,
+                self.random_state_,
+            )
+
+        # Create new samples for target variable
+        y_new = np.array([pos_class_label] * len(samples_indices))
+
+        return X_new, y_new
+
+    def _fit_resample(self, X, y):
+
+        # Validate estimator's parameters
+        self._validate_estimator()
+
+        # Copy data
+        X_resampled, y_resampled = X.copy(), y.copy()
+
+        # Resample data
+        for class_label, n_samples in self.sampling_strategy_.items():
+
+            # Apply gsmote mechanism
+            X_new, y_new = self._make_geometric_samples(X, y, class_label, n_samples)
+
+            # Append new data
+            X_resampled, y_resampled = (
+                np.vstack((X_resampled, X_new)),
+                np.hstack((y_resampled, y_new)),
+            )
+
+        return X_resampled, y_resampled