[WIP] ENH: SPIDER Sampling Algorithm #603

Open · wants to merge 24 commits into master

Commits (24)
54e1377  SPIDER class runs, need to develop tests to make sure it is working a… (MattEding, Sep 16, 2019)
df48951  unit tests for weak, resample, strong pass; need to fix sparse tests (MattEding, Sep 18, 2019)
774be31  pep8; unit testing spider (MattEding, Sep 18, 2019)
3f21d37  fix sparse spider test, remove jupyter notebooks (MattEding, Sep 18, 2019)
58f4d78  Merge branch 'master' of https://github.com/scikit-learn-contrib/imba… (MattEding, Sep 18, 2019)
dbb0564  docstring SPIDER; spider sample strategy list (MattEding, Sep 19, 2019)
e82fe2c  pep8 (MattEding, Sep 19, 2019)
c0fe84e  spider dense & sparse now resample in same order; add spider formulat… (MattEding, Sep 30, 2019)
4ec89cf  pep8; remove old commented-out code in spider amplify (MattEding, Sep 30, 2019)
504d601  Merge branch 'master' into spider (MattEding, Nov 17, 2019)
2517bd9  fixed syntax error; removed unused & variables; _safe_indexing; n_job… (MattEding, Nov 19, 2019)
a9cd91c  remove unused import & variables (MattEding, Nov 19, 2019)
b8c8a8e  fixed relative import; __all__ quote cosmetics (MattEding, Nov 19, 2019)
5888ff7  update unit tests for preproces-sampling; fix cleaning err msg to hav… (MattEding, Nov 20, 2019)
54fce0d  Merge branch 'master' of https://github.com/scikit-learn-contrib/imba… (MattEding, Nov 20, 2019)
f354ef0  fix test_docstring (MattEding, Nov 20, 2019)
a8d21e0  fix docstring format errors; add unit tests (MattEding, Nov 20, 2019)
149ea38  fix renamed param in example plotting (MattEding, Nov 20, 2019)
ba8347a  Merge branch 'master' of https://github.com/scikit-learn-contrib/imba… (MattEding, Nov 21, 2019)
1155fa3  add additional_neighbors validation tests; refactor try-except to sat… (MattEding, Nov 22, 2019)
3dd9ae9  fix 80 char line (MattEding, Nov 22, 2019)
610f3c6  fix visual indent; remove unused import (MattEding, Nov 22, 2019)
583af30  import from _testing to remove future warning (MattEding, Nov 22, 2019)
53dfc4e  fix assertion typo (MattEding, Nov 22, 2019)
256 changes: 256 additions & 0 deletions examples/combine/plot_illustration_spider.py
@@ -0,0 +1,256 @@
"""
==========================================================================
Illustration of the sample selection for the different SPIDER algorithms
==========================================================================

This example illustrates the different ways of resampling with SPIDER.

"""

# Authors: Matthew Eding
# License: MIT

from collections import namedtuple
from functools import partial

import matplotlib.pyplot as plt
import numpy as np

from imblearn.combine import SPIDER
from matplotlib.patches import Circle
from scipy.stats import mode

print(__doc__)

###############################################################################
# These are helper functions for plotting aspects of the algorithm

Neighborhood = namedtuple('Neighborhood', 'radius, neighbors')


def plot_X(X, ax, **kwargs):
    ax.scatter(X[:, 0], X[:, 1], **kwargs)


def correct(nn, y_fit, X, y, additional=False):
    # Flag each sample as 'safe' when a KNN vote over its nearest neighbors
    # (excluding the sample itself) predicts its true label.
    n_neighbors = nn.n_neighbors
    if additional:
        n_neighbors += 2
    nn_idxs = nn.kneighbors(X, n_neighbors, return_distance=False)[:, 1:]
    y_pred, _ = mode(y_fit[nn_idxs], axis=1)
    return (y == y_pred.ravel())


def get_neighborhoods(spider, X_fit, y_fit, X_flagged, y_flagged, idx):
    point = X_flagged[idx]

    additional = (spider.kind_sel == 'strong')
    if correct(spider.nn_, y_fit, point[np.newaxis],
               y_flagged[idx][np.newaxis], additional=additional):
        additional = False

    idxs_k = spider._locate_neighbors(point[np.newaxis])
    neighbors_k = X_fit[idxs_k].squeeze()
    farthest_k = neighbors_k[-1]
    radius_k = np.linalg.norm(point - farthest_k)
    neighborhood_k = Neighborhood(radius_k, neighbors_k)

    idxs_k2 = spider._locate_neighbors(point[np.newaxis], additional=True)
    neighbors_k2 = X_fit[idxs_k2].squeeze()
    farthest_k2 = neighbors_k2[-1]
    radius_k2 = np.linalg.norm(point - farthest_k2)
    neighborhood_k2 = Neighborhood(radius_k2, neighbors_k2)

    return neighborhood_k, neighborhood_k2, point, additional


def draw_neighborhoods(spider, neighborhood_k, neighborhood_k2, point,
                       additional, ax, outer=True, alpha=0.5):
    PartialCircle = partial(Circle, facecolor='none', edgecolor='black',
                            alpha=alpha)

    circle_k = PartialCircle(point, neighborhood_k.radius, linestyle='-')

    circle_k2 = PartialCircle(point, neighborhood_k2.radius,
                              linestyle=('-' if additional else '--'))

    if not additional:
        ax.add_patch(circle_k)

    if (spider.kind_sel == 'strong') and outer:
        ax.add_patch(circle_k2)


def draw_amplification(X_flagged, point, neighbors, ax):
    for neigh in neighbors:
        arr = np.vstack([point, neigh])
        xs, ys = np.split(arr, 2, axis=1)
        linestyle = 'solid' if neigh in X_flagged else 'dotted'
        ax.plot(xs, ys, color='black', linestyle=linestyle)


def plot_spider(kind_sel, X, y):
    if kind_sel == 'strong':
        _, axes = plt.subplots(2, 1, figsize=(12, 16))
    else:
        _, axes = plt.subplots(1, 1, figsize=(12, 8))
    axes = np.atleast_1d(axes)

    spider = SPIDER(kind_sel=kind_sel)
    spider.fit_resample(X, y)

    is_safe = correct(spider.nn_, y, X, y)
    is_minor = (y == 1)

    X_major = X[~is_minor]
    X_minor = X[is_minor]
    X_noise = X[~is_safe]

    X_minor_noise = X[is_minor & ~is_safe]
    y_minor_noise = y[is_minor & ~is_safe]
    X_major_safe = X[~is_minor & is_safe]
    X_minor_safe = X[is_minor & is_safe]
    y_minor_safe = y[is_minor & is_safe]

    partial_neighborhoods = partial(get_neighborhoods, spider, X, y)
    partial_amplification = partial(draw_amplification, X_major_safe)
    partial_draw_neighborhoods = partial(draw_neighborhoods, spider)

    size = 500
    for axis in axes:
        plot_X(X_minor, ax=axis, label='Minority class', s=size, marker='_')
        plot_X(X_major, ax=axis, label='Majority class', s=size, marker='+')

        #: Overlay ring around noisy samples for both classes
        plot_X(X_noise, ax=axis, label='Noisy Sample', s=size, marker='o',
               facecolors='none', edgecolors='black')

    #: Neighborhoods for Noisy Minority Samples
    for idx in range(len(X_minor_noise)):
        neighborhoods = partial_neighborhoods(X_minor_noise, y_minor_noise,
                                              idx=idx)
        partial_draw_neighborhoods(*neighborhoods, ax=axes[0],
                                   outer=(spider.kind_sel == 'strong'))
        neigh_k, neigh_k2, point, additional = neighborhoods
        neighbors = neigh_k2.neighbors if additional else neigh_k.neighbors
        partial_amplification(point, neighbors, ax=axes[0])

    axes[0].axis('equal')
    axes[0].legend(markerscale=0.5)
    axes[0].set_title(f'SPIDER-{spider.kind_sel.title()}')

    #: Neighborhoods for Safe Minority Samples (kind_sel='strong' only)
    if spider.kind_sel == 'strong':
        for idx in range(len(X_minor_safe)):
            neighborhoods = partial_neighborhoods(X_minor_safe, y_minor_safe,
                                                  idx=idx)
            neigh_k, _, point, additional = neighborhoods
            neighbors = neigh_k.neighbors
            draw_flag = np.any(np.isin(neighbors, X_major_safe))

            alpha = 0.5 if draw_flag else 0.1
            partial_draw_neighborhoods(*neighborhoods[:-1], additional=False,
                                       ax=axes[1], outer=False, alpha=alpha)

            if draw_flag:
                partial_amplification(point, neighbors, ax=axes[1])

        axes[1].axis('equal')
        axes[1].legend(markerscale=0.5)
        axes[1].set_title(f'SPIDER-{spider.kind_sel.title()}')


###############################################################################
# We can start by generating some data to later illustrate the principle of
# each SPIDER heuristic rule.

X = np.array([
[-11.83, -6.81],
[-11.72, -2.34],
[-11.43, -5.85],
[-10.66, -4.33],
[-9.64, -7.05],
[-8.39, -4.41],
[-8.07, -5.66],
[-7.28, 0.91],
[-7.24, -2.41],
[-6.13, -4.81],
[-5.92, -6.81],
[-4., -1.81],
[-3.96, 2.67],
[-3.74, -7.31],
[-2.96, 4.69],
[-1.56, -2.33],
[-1.02, -4.57],
[0.46, 4.07],
[1.2, -1.53],
[1.32, 0.41],
[1.56, -5.19],
[2.52, 5.89],
[3.03, -4.15],
[4., -0.59],
[4.4, 2.07],
[4.41, -7.45],
[4.45, -4.12],
[5.13, -6.28],
[5.4, -5],
[6.26, 4.65],
[7.02, -6.22],
[7.5, -0.11],
[8.1, -2.05],
[8.42, 2.47],
[9.62, 3.87],
[10.54, -4.47],
[11.42, 0.01]
])

y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0])


###############################################################################
# SPIDER-Weak / SPIDER-Relabel
###############################################################################

###############################################################################
# Both SPIDER-Weak and SPIDER-Relabel start by labeling each sample as 'safe'
# or 'noisy', depending on whether the sample would be classified correctly
# by a KNN vote over its 3-NN. Each minority-noisy sample is then amplified
# (duplicated) once for every majority-safe sample in its 3-NN. In the
# diagram below, the amplification amount is indicated by the number of solid
# lines in a given minority-noisy sample's neighborhood.
#
# We can observe that the leftmost minority-noisy sample will be duplicated 3
# times, the middle one 1 time, and the rightmost one will not be amplified.
#
# Then, with SPIDER-Weak, every majority-noisy sample is removed from the
# dataset. With SPIDER-Relabel, their class is instead relabeled as the
# minority class. These are the samples indicated by a circled plus-sign.

plot_spider('weak', X, y)
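
###############################################################################
# As a minimal usage sketch (assuming the ``SPIDER`` estimator and
# ``kind_sel`` parameter proposed in this PR), the resampled dataset can also
# be obtained directly with ``fit_resample`` and its class balance inspected.

from collections import Counter

spider_weak = SPIDER(kind_sel='weak')
X_weak, y_weak = spider_weak.fit_resample(X, y)
print('Original class counts:   ', Counter(y))
print('SPIDER-Weak class counts:', Counter(y_weak))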

###############################################################################
# SPIDER-Strong
###############################################################################

###############################################################################
# SPIDER-Strong still uses 3-NN to classify samples as 'safe' or 'noisy' in
# the first step. However, for the amplification step each minority-noisy
# sample looks at its 5-NN: if the larger neighborhood still misclassifies
# the sample, the 5-NN is used to amplify; otherwise, if the sample is
# correctly classified with 5-NN, the regular 3-NN is used to amplify.
#
# In the diagram below, we can see that the leftmost and rightmost
# minority-noisy samples are misclassified using 5-NN and will be amplified
# by 5 and 1 respectively. The middle minority-noisy sample is classified
# correctly using 5-NN, so amplification will be done using 3-NN.
#
# Next, the amplification process is applied to each minority-safe sample
# using 3-NN. In the lower subplot, all but one of these samples are left
# unamplified since they have no majority-safe samples in their
# neighborhoods. The one minority-safe sample that is amplified is drawn with
# a darker neighborhood and connecting lines.

plot_spider('strong', X, y)
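
###############################################################################
# The same sketch for the strong variant (again assuming the API proposed in
# this PR); the resulting class counts can be compared with those of
# SPIDER-Weak above.

spider_strong = SPIDER(kind_sel='strong')
X_strong, y_strong = spider_strong.fit_resample(X, y)
print('SPIDER-Strong class counts:', Counter(y_strong))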

plt.show()
7 changes: 6 additions & 1 deletion imblearn/combine/__init__.py
@@ -4,5 +4,10 @@
 
 from ._smote_enn import SMOTEENN
 from ._smote_tomek import SMOTETomek
+from ._preprocess import SPIDER
 
-__all__ = ["SMOTEENN", "SMOTETomek"]
+__all__ = [
+    "SMOTEENN",
+    "SMOTETomek",
+    "SPIDER",
+]
3 changes: 3 additions & 0 deletions imblearn/combine/_preprocess/__init__.py
@@ -0,0 +1,3 @@
from ._spider import SPIDER

__all__ = ["SPIDER"]