From 3a9dece513b4eff18c8879aff05fdd25d4264ce8 Mon Sep 17 00:00:00 2001 From: bachwani Date: Thu, 22 Aug 2019 23:34:05 +0530 Subject: [PATCH 1/6] Added a few features --- yellowbrick/features/decomposition.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/yellowbrick/features/decomposition.py b/yellowbrick/features/decomposition.py index c679f1d5f..6c8000949 100644 --- a/yellowbrick/features/decomposition.py +++ b/yellowbrick/features/decomposition.py @@ -1,6 +1,8 @@ ########################################################################## ## Imports ########################################################################## +import numpy as np +import bisect from .base import FeatureVisualizer from yellowbrick.style import palettes @@ -90,7 +92,8 @@ class ExplainedVariance(FeatureVisualizer): """ def __init__(self, n_components=None, ax=None, scale=True, center=True, - colormap=palettes.DEFAULT_SEQUENCE, **kwargs): + colormap=palettes.DEFAULT_SEQUENCE, cumulative=False, cutoff=95, + **kwargs): super(ExplainedVariance, self).__init__(ax=ax, **kwargs) @@ -102,10 +105,16 @@ def __init__(self, n_components=None, ax=None, scale=True, center=True, with_std=self.scale)), ('pca', PCA(n_components=self.n_components))]) self.pca_features = None - + self.cumulative = cumulative + self.cutoff = cutoff + @property def explained_variance_(self): return self.pipeline.steps[-1][1].explained_variance_ + + @property + def explained_variance_ratio_(self): + return self.pipeline.steps[-1][1].explained_variance_ratio_ def fit(self, X, y=None): self.pipeline.fit(X) @@ -117,8 +126,17 @@ def transform(self, X): return self.pca_features def draw(self): - X = self.explained_variance_ - self.ax.plot(X) + X = self.explained_variance_ratio_ + self.ax.plot(X, label = "Explained Variance") + if (self.cumulative): + X = np.cumsum(self.explained_variance_ratio_) + self.ax.plot(X, label = "Cumulative Variance") + + n_comp = bisect.bisect_left(X, 
self.cutoff/100); + self.ax.vlines(n_comp, 0, X[n_comp], linestyle = "dashed", + label=str(self.cutoff)+"% Variance") + self.ax.hlines(X[n_comp], 0, n_comp, linestyle = "dashed") + return self.ax def finalize(self, **kwargs): @@ -128,4 +146,5 @@ def finalize(self, **kwargs): # Set the axes labels self.ax.set_ylabel('Explained Variance') self.ax.set_xlabel('Number of Components') + self.ax.legend(loc="center right") From 7ae23fedc9131c61523acdef28072c53cd0ded7f Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Mon, 10 Feb 2020 19:41:15 -0500 Subject: [PATCH 2/6] explained variance stubs --- docs/api/features/explained_variance.rst | 54 ++++++++ docs/api/features/index.rst | 3 +- docs/index.rst | 3 +- .../test_features/test_explained_variance.py | 43 ++++++ yellowbrick/features/__init__.py | 1 + ...decomposition.py => explained_variance.py} | 129 ++++++++---------- yellowbrick/features/pca.py | 35 +---- 7 files changed, 167 insertions(+), 101 deletions(-) create mode 100644 docs/api/features/explained_variance.rst create mode 100644 tests/test_features/test_explained_variance.py rename yellowbrick/features/{decomposition.py => explained_variance.py} (62%) diff --git a/docs/api/features/explained_variance.rst b/docs/api/features/explained_variance.rst new file mode 100644 index 000000000..808272963 --- /dev/null +++ b/docs/api/features/explained_variance.rst @@ -0,0 +1,54 @@ +.. -*- mode: rst -*- + +Explained Variance +================== + +================= ================= +Visualizer :class:`~yellowbrick.features.explained_variance.ExplainedVariance` +Quick Method :func:`~yellowbrick.features.explained_variance.explained_variance` +Models Decomposition +Workflow Feature Engineering +================= ================= + +.. 
plot:: + :context: close-figs + :alt: Explained variance visualizer on the credit dataset + + from yellowbrick.datasets import load_credit + from yellowbrick.features import ExplainedVariance + + # Load the feature matrix (the target is not used here) + X, _ = load_credit() + + # Instantiate the visualizer, fit and transform the data + oz = ExplainedVariance() + oz.fit_transform(X) + oz.show() + + +Quick Method +------------ + +The same functionality above can be achieved with the associated quick method ``explained_variance``. This method will build the ``ExplainedVariance`` visualizer with the associated arguments, fit it, then (optionally) immediately show it. + +.. plot:: + :context: close-figs + :alt: Explained variance quick method on the concrete dataset + + from yellowbrick.datasets import load_concrete + from yellowbrick.features import explained_variance + + # Load the feature matrix (the target is not used here) + X, _ = load_concrete() + + # Determine the optimal number of components + oz = explained_variance(X) + + +API Reference +------------- + +.. automodule:: yellowbrick.features.explained_variance + :members: ExplainedVariance, explained_variance + :undoc-members: + :show-inheritance: diff --git a/docs/api/features/index.rst b/docs/api/features/index.rst index 3dce89d93..1f32de088 100644 --- a/docs/api/features/index.rst +++ b/docs/api/features/index.rst @@ -47,6 +47,7 @@ finalizes and displays the image. 
radviz rankd pcoords - pca manifold + pca + explained_variance jointplot diff --git a/docs/index.rst b/docs/index.rst index 7ca743ad2..1ee0e8cfc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,8 +56,9 @@ Feature Visualization - :doc:`api/features/rankd`: pairwise ranking of features to detect relationships - :doc:`api/features/pcoords`: horizontal visualization of instances - :doc:`Radial Visualization `: separation of instances around a circular plot -- :doc:`api/features/pca`: projection of instances based on principal components - :doc:`api/features/manifold`: high dimensional visualization with manifold learning +- :doc:`api/features/pca`: projection of instances based on principal components +- :doc:`api/features/explained_variance`: select number of components for PCA - :doc:`Joint Plots `: direct data visualization with feature selection Classification Visualization diff --git a/tests/test_features/test_explained_variance.py b/tests/test_features/test_explained_variance.py new file mode 100644 index 000000000..acee543f7 --- /dev/null +++ b/tests/test_features/test_explained_variance.py @@ -0,0 +1,43 @@ +# tests.test_features.test_explained_variance +# Tests for the PCA explained variance visualizer +# +# Author: Benjamin Bengfort +# Created: Mon Feb 10 19:11:46 2020 -0500 +# +# Copyright (C) 2019 The scikit-yb developers +# For license information, see LICENSE.txt +# +# ID: test_explained_variance.py [] benjamin@bengfort.com $ + +""" +Tests for the PCA explained variance visualizer +""" + +########################################################################## +## Imports +########################################################################## + +from tests.base import VisualTestCase + +from yellowbrick.datasets import load_credit +from yellowbrick.features.explained_variance import * + + +########################################################################## +## ExplainedVariance Tests 
+########################################################################## + +class TextExplainedVariance(VisualTestCase): + """ + Test the explained variance visualizer + """ + + def test_quick_method(self): + """ + Test the explained variance quick method + """ + X, _ = load_credit() + oz = explained_variance(X) + + assert isinstance(oz, ExplainedVariance) + self.assert_images_similar(oz) diff --git a/yellowbrick/features/__init__.py b/yellowbrick/features/__init__.py index 7b7fc0fc2..c689f8e23 100644 --- a/yellowbrick/features/__init__.py +++ b/yellowbrick/features/__init__.py @@ -24,6 +24,7 @@ from .jointplot import JointPlot, JointPlotVisualizer, joint_plot from .pca import PCA, PCADecomposition, pca_decomposition from .manifold import Manifold, manifold_embedding +from .explained_variance import ExplainedVariance, explained_variance # Alias the TargetType defined in yellowbrick.utils.target from yellowbrick.utils.target import TargetType diff --git a/yellowbrick/features/decomposition.py b/yellowbrick/features/explained_variance.py similarity index 62% rename from yellowbrick/features/decomposition.py rename to yellowbrick/features/explained_variance.py index 5a4971911..12b356553 100644 --- a/yellowbrick/features/decomposition.py +++ b/yellowbrick/features/explained_variance.py @@ -1,12 +1,13 @@ -# yellowbrick.features.decomposition +# yellowbrick.features.explained_variance # # Author: George Richardson +# Author: Benjamin Bengfort # Created: Fri Mar 2 16:16:00 2018 +0000 # # Copyright (C) 2016 The scikit-yb developers # For license information, see LICENSE.txt # -# ID: decomposition.py [0ed6e8a] g.raymond.richardson@gmail.com $ +# ID: explained_variance.py [0ed6e8a] g.raymond.richardson@gmail.com $ ########################################################################## ## Imports @@ -19,73 +20,6 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler 
-########################################################################## -## Quick Methods -########################################################################## - - -def explained_variance_visualizer( - X, - y=None, - ax=None, - scale=True, - center=True, - colormap=palettes.DEFAULT_SEQUENCE, - **kwargs -): - """Produce a plot of the explained variance produced by a dimensionality - reduction algorithm using n=1 to n=n_components dimensions. This is a single - plot to help identify the best trade off between number of dimensions - and amount of information retained within the data. - - Parameters - ---------- - X : ndarray or DataFrame of shape n x m - A matrix of n rows with m features - - y : ndarray or Series of length n - An array or Series of target or class values - - ax : matplotlib Axes, default: None - The aces to plot the figure on - - scale : bool, default: True - Boolean that indicates if the values of X should be scaled. - - colormap : string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. - - kwargs : dict - Keyword arguments that are passed to the base class and may influence - the visualization as defined in other Visualizers. 
- - Returns - ------- - viz : ExplainedVariance - Returns the fitted, finalized visualizer - - Examples - -------- - >>> from sklearn import datasets - >>> bc = datasets.load_breast_cancer() - >>> X = bc = bc.data - >>> explained_variance_visualizer(X, scale=True, center=True, colormap='RdBu_r') - - """ - - # Instantiate the visualizer - visualizer = ExplainedVariance(X=X) - - # Fit and transform the visualizer (calls draw) - visualizer.fit(X, y, **kwargs) - visualizer.transform(X) - visualizer.finalize() - - # Return the visualizer object - return visualizer - ########################################################################## ## Explained Variance Feature Visualizer @@ -172,3 +106,60 @@ def finalize(self, **kwargs): # Set the axes labels self.ax.set_ylabel("Explained Variance") self.ax.set_xlabel("Number of Components") + + +########################################################################## +## Quick Method +########################################################################## + +def explained_variance( + X, + y=None, + ax=None, + show=True, + **kwargs +): + """ExplainedVariance quick method. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m + A matrix of n instances with m features to determine principal components for. + + y : ndarray or Series of length n, default: None + An array or series of target or class values. This argument is not used but is + enabled for pipeline purposes. + + ax : matplotlib Axes, default: None + The axes to plot the figure on. If None is passed in, the current axes + will be used (or generated if required). + + show : bool, default: True + If True, calls ``show()``, which in turn calls ``plt.show()`` however you cannot + call ``plt.savefig`` from this signature, nor ``clear_figure``. If False, simply + calls ``finalize()`` + + kwargs : dict + Keyword arguments that are passed to the base class and may influence + the visualization as defined in other Visualizers. 
+ + Returns + ------- + viz : ExplainedVariance + Returns the fitted, finalized visualizer + """ + + # Instantiate the visualizer, forwarding the axes and visualizer keyword arguments + oz = ExplainedVariance(ax=ax, **kwargs) + + # Fit and transform the visualizer (calls draw) + oz.fit(X, y) + oz.transform(X) + + if show: + oz.show() + else: + oz.finalize() + + # Return the visualizer object + return oz diff --git a/yellowbrick/features/pca.py b/yellowbrick/features/pca.py index 6b09dfca5..64183ef56 100644 --- a/yellowbrick/features/pca.py +++ b/yellowbrick/features/pca.py @@ -471,8 +471,8 @@ def pca_decomposition( show=True, **kwargs ): + """PCA quick meethod. - """ Produce a two or three dimensional principal component plot of the data array ``X`` projected onto its largest sequential principal components. It is common practice to scale the data array ``X`` before applying a PC decomposition. Variable scaling @@ -557,35 +557,10 @@ def pca_decomposition( Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. - Attributes - ---------- - pca_components_ : ndarray, shape (n_features, n_components) - This tells about the magnitude of each feature in the pricipal components. - This is primarily used to draw the biplots. - - classes_ : ndarray, shape (n_classes,) - The class labels that define the discrete values in the target. Only - available if the target type is discrete. This is guaranteed to be - strings even if the classes are a different type. - - features_ : ndarray, shape (n_features,) - The names of the features discovered or used in the visualizer that - can be used as an index to access or modify data in X. If a user passes - feature names in, those features are used. Otherwise the columns of a - DataFrame are used or just simply the indices of the data array. - - range_ : (min y, max y) - A tuple that describes the minimum and maximum values in the target. - Only available if the target type is continuous. 
- - Examples - -------- - >>> from sklearn import datasets - >>> iris = datasets.load_iris() - >>> X = iris.data - >>> y = iris.target - >>> pca_decomposition(X, y, colors=['r', 'g', 'b'], projection=3) - + Returns + ------- + viz : PCA + Returns the fitted, finalized visualizer """ # Instantiate the visualizer visualizer = PCA( From 93730dd0bf5f34bc30bb00c47dc25ea9b70c9e8c Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Tue, 25 Feb 2020 18:26:11 -0500 Subject: [PATCH 3/6] first prototype of explained variance --- yellowbrick/features/explained_variance.py | 258 ++++++++++++++++++--- 1 file changed, 220 insertions(+), 38 deletions(-) diff --git a/yellowbrick/features/explained_variance.py b/yellowbrick/features/explained_variance.py index 12b356553..151c8c9e1 100644 --- a/yellowbrick/features/explained_variance.py +++ b/yellowbrick/features/explained_variance.py @@ -13,12 +13,14 @@ ## Imports ########################################################################## -from yellowbrick.style import palettes -from yellowbrick.features.base import FeatureVisualizer +import bisect +import numpy as np from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA +from yellowbrick.exceptions import NotFitted from sklearn.preprocessing import StandardScaler +from yellowbrick.features.base import FeatureVisualizer ########################################################################## @@ -32,80 +34,260 @@ class ExplainedVariance(FeatureVisualizer): Parameters ---------- ax : matplotlib Axes, default: None - The aces to plot the figure on + The axes to plot the figure on. If None is passed in, the current axes + will be used (or generated if required). - scale : bool, default: True - Boolean that indicates if the values of X should be scaled. + transformer : PCA or Pipeline, default: None + By default the visualizer creates a PCA transformer with all components, scaling + the data on request. 
Users can submit their own transformer or pipeline to + visualize the explained variance for, so long as the transformer or the last + step in the pipeline has ``explained_variance_`` and + ``explained_variance_ratio_`` learned attributes after being fitted. - colormap : string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. + cumulative : bool, default: True + Display the cumulative explained variance of components ordered by magnitude, + otherwise display each component's direct value. + + ratio : bool, default: True + Display the ratio of the component's explained variance to the total variance, + otherwise display the amount of variance. + + scale : bool, default: True + If true, the default PCA used by the visualizer has a standard scaler applied + to the data using the mean and standard deviation. This argument is ignored if + a user supplied transformer exists. + + n_components : int, default: None + Whether or not to limit the number of components whose variance is explained + in the visualizer created transformer. This argument is ignored if a user supplied + transformer exists. + + is_fitted : bool, default=False + Specify if the user supplied transformer is already fitted. If False, the + transformer will be fit when the visualizer is fit, otherwise the transformer + will not be modified. Note that if a user supplied transformer is fitted, then + no additional calls to the visualizer ``fit()`` method are required (a unique + behavior of the ``ExplainedVariance`` visualizer). + + random_state : int, RandomState instance or None, optional (default None) + Set the random state on the underlying PCA solver. Note that if a user supplied + transformer exists, this parameter is ignored. kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. 
+ Attributes + ---------- + explained_variance_ : array, shape (n_components,) + The amount of variance explained by each of the selected components. + Equal to n_components largest eigenvalues of the covariance matrix of X. + + explained_variance_ratio_ : array, shape (n_components,) + Percentage of variance explained by each of the selected components. + If n_components is not set then all components are stored and the sum of the + ratios is equal to 1.0. Examples -------- - >>> visualizer = ExplainedVariance() - >>> visualizer.fit(X) - >>> visualizer.transform(X) + >>> visualizer.fit_transform(X) >>> visualizer.show() + Notes + ----- + This visualizer wraps (by default) a sklearn.decomposition.PCA object which may have + many other learned attributes of interest, such as ``singular_values_`` or + ``noise_variance_``. To access these properties of the fitted underlying + decomposition, use the visualizer's ``transformer`` property. """ def __init__( self, ax=None, + transformer=None, + cumulative=True, + ratio=True, scale=True, - center=True, n_components=None, - colormap=palettes.DEFAULT_SEQUENCE, + is_fitted=False, + random_state=None, **kwargs ): - + # Initialize the visualizer super(ExplainedVariance, self).__init__(ax=ax, **kwargs) - self.colormap = colormap - self.n_components = n_components - self.center = center + # Set the transformer and drawing parameters + self.cumulative = cumulative + self.ratio = ratio self.scale = scale - self.pipeline = Pipeline( - [ - ("scale", StandardScaler(with_mean=self.center, with_std=self.scale)), - ("pca", PCA(n_components=self.n_components)), - ] - ) - self.pca_features = None + self.n_components = n_components + self.is_fitted = is_fitted + self.random_state = random_state + + # NOTE: this parameter must be set last to initialize a new transformer + self.transformer = transformer + + # Keep track of internal state + self._drawn_on_fit = False @property - def explained_variance_(self): - return 
self.pipeline.steps[-1][1].explained_variance_ + def transformer(self): + """ + Returns the underlying transformer used for explained variance. + """ + return self._transformer + + @transformer.setter + def transformer(self, transformer): + """ + Creates a PCA pipeline using scaling and number of components if None is passed + in, otherwise sets the user supplied transformer for use in the visualization. + """ + if transformer is None: + # In this case we have to fit the underlying model, so ignore user + self.is_fitted = False + + # Create the default PCA transformer since none was supplied + transformer = PCA( + n_components=self.n_components, random_state=self.random_state + ) + + # Add a standard scaler if specified + if self.scale: + transformer = Pipeline([ + ("scale", StandardScaler(with_mean=True, with_std=True)), + ("pca", transformer) + ]) + + self._transformer = transformer def fit(self, X, y=None): - self.pipeline.fit(X) + """ + Fits the transformer on X (if not already fitted) and draws its explained variance. + + Parameters + ---------- + X : array-like of shape (n, m) + A matrix or data frame with n instances and m features + + y : array-like of shape (n,), optional + A vector or series with target values for each instance in X. + Not used for ExplainedVariance but allowed here to support visual pipelines. + + Returns + ------- + self : ExplainedVariance + Returns the visualizer object. + """ + if not self.is_fitted: + self.transformer.fit(X) + + # Get the explained variance learned attributes from the transformer + self._set_explained_variance_attributes() self.draw() + + # Prevent duplicate drawing on calls to fit_transform() + self._drawn_on_fit = True return self - def transform(self, X): - self.pca_features = self.pipeline.transform(X) - return self.pca_features + def transform(self, X=None, y=None): + """ + Transform the data using the underlying transformer, which usually performs + dimensionality reduction on the input features ``X``. 
This method can also be + called with a fitted model without passing data in order to draw the explained + variances without data. + + Parameters + ---------- + X : ndarray or DataFrame of shape n x m, optional + A matrix of n instances with m features. + + y : ndarray or Series of length n, optional + An array or series of target or class values. + Not used by the transformer, but supported to allow visual pipelines. + + Returns + ------- + Xp : ndarray or DataFrame of shape n x m + Returns a new array-like object of transformed features of shape + ``(len(X), self.n_components)``. + """ + if not self._drawn_on_fit: + # Draw on transform instead - note that this may change the attributes + self._set_explained_variance_attributes() + self.draw() + + if X is not None: + return self.transformer.transform(X) + return None def draw(self): - X = self.explained_variance_ - self.ax.plot(X) + if self.ratio: + if not hasattr(self, "explained_variance_ratio_"): + raise NotFitted(( + "transformer does not have the explained_variance_ratio_, " + "use ratio=False or ensure the visualizer is fitted." + )) + X = self.explained_variance_ratio_ + + else: + if not hasattr(self, "explained_variance_"): + raise NotFitted(( + "transformer does not have the explained_variance_, " + "use ratio=True or ensure the visualizer is fitted." 
+ )) + X = self.explained_variance_ + + label = self.transformer.__class__.__name__ + if isinstance(self.transformer, Pipeline): + label = self.transformer.steps[-1][1].__class__.__name__ + + if self.cumulative: + X = np.cumsum(X) + self.ax.plot(X, label=label) + + # TODO: allow the user to specify the cutoff amounts + prev = 0 + for cutoff in [0.0, .50, .85, .95, .999]: + components = bisect.bisect_left(X, cutoff) + self.ax.fill_between(np.arange(0, components), 0, X[:components], color='b', alpha=min(1-cutoff+.2, 1), label="{:0.0f}%".format(cutoff*100)) + + else: + self.ax.plot(X, label=label) + + # TODO: visualize the amount of explained variance from each component + return self.ax def finalize(self, **kwargs): # Set the title - self.set_title("Explained Variance Plot") - - # Set the axes labels - self.ax.set_ylabel("Explained Variance") - self.ax.set_xlabel("Number of Components") + title = "Explained Variance" + if self.cumulative: + title = "Cumulative " + title + self.set_title(title) + + if self.ratio: + self.ax.set_ylabel("ratio of explained variance") + else: + self.ax.set_ylabel("explained variance") + + self.ax.set_xlabel("number of components") + self.ax.legend(loc="best", frameon=True) + + def _set_explained_variance_attributes(self): + """ + Helper function to discover the required attributes on the transformer. Does + not raise any exceptions if they cannot be found, but does not set the + attributes on the visualizer if they aren't. 
+ """ + obj = self.transformer + if isinstance(obj, Pipeline): + obj = obj.steps[-1][1] + + for attr in ("explained_variance_", "explained_variance_ratio_"): + if hasattr(obj, attr): + setattr(self, attr, getattr(obj, attr)) ########################################################################## From 4d27883f0651b82bbd3a93080e73b170e4803122 Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Thu, 5 Mar 2020 18:57:29 -0500 Subject: [PATCH 4/6] typo fixes, changelog update --- docs/changelog.rst | 13 ++++++++++++- yellowbrick/features/pca.py | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 1f9fabf1f..20751be70 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -3,12 +3,23 @@ Changelog ========= +Under Development +----------------- + +Major Changes: + - New ``ExplainedVariance`` visualizer that assists in the selection of the number of + components for principal component analysis. The visualizer plots both cumulative + and discrete explained variance against the number of components and shades in + percentiles of total explained variance for visual selection of the best fit. 
+ + + Version 1.1 ----------- * Tag: v1.1_ * Deployed Wednesday, February 12, 2020 -* Contributors: Benjamin Bengfort, Rebecca Bilbro, Kristen McIntyre, Larry Gray, Prema Roman, Adam Morris, Shivendra Sharma, Michael Chestnut, Michael Garod, Naresh Bachwani, Piyush Gautam, Daniel Navarrete, Molly Morrison, Emma Kwiecinska, Sarthak Jain, Tony Ojeda, Edwin Schmier, Nathan Danielsen +* Contributors: Benjamin Bengfort, Rebecca Bilbro, Kristen McIntyre, Larry Gray, Prema Roman, Adam Morris, Shivendra Sharma, Michael Chestnut, Michael Garod, Naresh Bachwani, Piyush Gautam, Daniel Navarrete, Molly Morrison, Emma Kwiecinska, Sarthak Jain, Tony Ojeda, Edwin Schmierer, Nathan Danielsen Major Changes: - Quick methods (aka Oneliners), which return a fully fitted finalized visualizer object in only a single line, are now implemented for all Yellowbrick Visualizers. Test coverage has been added for all quick methods. The documentation has been updated to document and demonstrate the usage of the quick methods. diff --git a/yellowbrick/features/pca.py b/yellowbrick/features/pca.py index 0210f9265..96a450dc5 100644 --- a/yellowbrick/features/pca.py +++ b/yellowbrick/features/pca.py @@ -471,7 +471,7 @@ def pca_decomposition( show=True, **kwargs ): - """PCA quick meethod. + """PCA quick method. Produce a two or three dimensional principal component plot of the data array ``X`` projected onto its largest sequential principal components. 
It is common practice From 043f8826536464eb435fb6875d4fd83f58432463 Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Thu, 5 Mar 2020 19:07:17 -0500 Subject: [PATCH 5/6] remove old decomposition method --- yellowbrick/features/decomposition.py | 169 -------------------------- 1 file changed, 169 deletions(-) delete mode 100644 yellowbrick/features/decomposition.py diff --git a/yellowbrick/features/decomposition.py b/yellowbrick/features/decomposition.py deleted file mode 100644 index 3992f8b2a..000000000 --- a/yellowbrick/features/decomposition.py +++ /dev/null @@ -1,169 +0,0 @@ -# yellowbrick.features.decomposition -# -# Author: George Richardson -# Created: Fri Mar 2 16:16:00 2018 +0000 -# -# Copyright (C) 2016 The scikit-yb developers -# For license information, see LICENSE.txt -# -# ID: decomposition.py [] g.raymond.richardson@gmail.com $ - -########################################################################## -## Imports -########################################################################## -import numpy as np -import bisect - -from yellowbrick.style import palettes -from yellowbrick.features.base import FeatureVisualizer - -from sklearn.pipeline import Pipeline -from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler - -########################################################################## -## Quick Methods -########################################################################## - - -def explained_variance_visualizer( - X, - y=None, - ax=None, - scale=True, - center=True, - colormap=palettes.DEFAULT_SEQUENCE, - **kwargs -): - """Produce a plot of the explained variance produced by a dimensionality - reduction algorithm using n=1 to n=n_components dimensions. This is a single - plot to help identify the best trade off between number of dimensions - and amount of information retained within the data. 
- Parameters - ---------- - X : ndarray or DataFrame of shape n x m - A matrix of n rows with m features - y : ndarray or Series of length n - An array or Series of target or class values - ax : matplotlib Axes, default: None - The aces to plot the figure on - scale : bool, default: True - Boolean that indicates if the values of X should be scaled. - colormap : string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. - kwargs : dict - Keyword arguments that are passed to the base class and may influence - the visualization as defined in other Visualizers. - Returns - ------- - viz : ExplainedVariance - Returns the fitted, finalized visualizer - Examples - -------- - >>> from sklearn import datasets - >>> bc = datasets.load_breast_cancer() - >>> X = bc = bc.data - >>> explained_variance_visualizer(X, scale=True, center=True, colormap='RdBu_r') - """ - - # Instantiate the visualizer - visualizer = ExplainedVariance(X=X) - - # Fit and transform the visualizer (calls draw) - visualizer.fit(X, y, **kwargs) - visualizer.transform(X) - visualizer.finalize() - - # Return the visualizer object - return visualizer - - -########################################################################## -## Explained Variance Feature Visualizer -########################################################################## - - -class ExplainedVariance(FeatureVisualizer): - """ - Parameters - ---------- - ax : matplotlib Axes, default: None - The aces to plot the figure on - scale : bool, default: True - Boolean that indicates if the values of X should be scaled. - colormap : string or cmap, default: None - optional string or matplotlib cmap to colorize lines - Use either color to colorize the lines on a per class basis or - colormap to color them on a continuous scale. 
- kwargs : dict - Keyword arguments that are passed to the base class and may influence - the visualization as defined in other Visualizers. - Examples - -------- - >>> visualizer = ExplainedVariance() - >>> visualizer.fit(X) - >>> visualizer.transform(X) - >>> visualizer.poof() - """ - - def __init__(self, n_components=None, ax=None, scale=True, center=True, - colormap=palettes.DEFAULT_SEQUENCE, cumulative=False, cutoff=95, - **kwargs): - - super(ExplainedVariance, self).__init__(ax=ax, **kwargs) - - self.colormap = colormap - self.n_components = n_components - self.center = center - self.scale = scale - self.pipeline = Pipeline( - [ - ("scale", StandardScaler(with_mean=self.center, with_std=self.scale)), - ("pca", PCA(n_components=self.n_components)), - ] - ) - self.pca_features = None - self.cumulative = cumulative - self.cutoff = cutoff - - @property - def explained_variance_(self): - return self.pipeline.steps[-1][1].explained_variance_ - - @property - def explained_variance_ratio_(self): - return self.pipeline.steps[-1][1].explained_variance_ratio_ - - def fit(self, X, y=None): - self.pipeline.fit(X) - self.draw() - return self - - def transform(self, X): - self.pca_features = self.pipeline.transform(X) - return self.pca_features - - def draw(self): - X = self.explained_variance_ratio_ - self.ax.plot(X, label = "Explained Variance") - if (self.cumulative): - X = np.cumsum(self.explained_variance_ratio_) - self.ax.plot(X, label = "Cumulative Variance") - - n_comp = bisect.bisect_left(X, self.cutoff/100); - self.ax.vlines(n_comp, 0, X[n_comp], linestyle = "dashed", - label=str(self.cutoff)+"% Variance") - self.ax.hlines(X[n_comp], 0, n_comp, linestyle = "dashed") - - return self.ax - - def finalize(self, **kwargs): - # Set the title - self.set_title("Explained Variance Plot") - - # Set the axes labels - self.ax.set_ylabel('Explained Variance') - self.ax.set_xlabel('Number of Components') - self.ax.legend(loc="center right") \ No newline at end of file From 
6ec08c88215316d0a1099da39df1bf954513dafd Mon Sep 17 00:00:00 2001 From: Benjamin Bengfort Date: Tue, 10 Mar 2020 08:32:30 -0400 Subject: [PATCH 6/6] minor cleanup --- yellowbrick/features/explained_variance.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yellowbrick/features/explained_variance.py b/yellowbrick/features/explained_variance.py index 151c8c9e1..83897fd4e 100644 --- a/yellowbrick/features/explained_variance.py +++ b/yellowbrick/features/explained_variance.py @@ -248,7 +248,6 @@ def draw(self): self.ax.plot(X, label=label) # TODO: allow the user to specify the cutoff amounts - prev = 0 for cutoff in [0.0, .50, .85, .95, .999]: components = bisect.bisect_left(X, cutoff) self.ax.fill_between(np.arange(0, components), 0, X[:components], color='b', alpha=min(1-cutoff+.2, 1), label="{:0.0f}%".format(cutoff*100))