From 37cdcb5c3c169d845c1d4f2974258ff2a4d85ba8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 2 Sep 2024 23:52:22 +0200 Subject: [PATCH 1/6] FEA add link for HTML representation --- skrub/_repr.py | 147 +++++++++++++++++++++++++++++++++++++ skrub/_table_vectorizer.py | 12 ++- 2 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 skrub/_repr.py diff --git a/skrub/_repr.py b/skrub/_repr.py new file mode 100644 index 000000000..4e873f1b8 --- /dev/null +++ b/skrub/_repr.py @@ -0,0 +1,147 @@ +import itertools + +import sklearn +from sklearn.utils.fixes import parse_version + +from . import __version__ + +sklearn_version = parse_version(sklearn.__version__) + +if sklearn_version > parse_version("1.6"): + from sklearn.utils._estimator_html_repr import _HTMLDocumentationLinkMixin +else: + + class _HTMLDocumentationLinkMixin: + """Mixin class allowing to generate a link to the API documentation. + + This mixin relies on three attributes: + - `_doc_link_module`: it corresponds to the root module (e.g. `sklearn`). Using + this mixin, the default value is `sklearn`. + - `_doc_link_template`: it corresponds to the template used to generate the + link to the API documentation. Using this mixin, the default value is + `"https://scikit-learn.org/{version_url}/modules/generated/ + {estimator_module}.{estimator_name}.html"`. + - `_doc_link_url_param_generator`: it corresponds to a function that generates + the parameters to be used in the template when the estimator module and name + are not sufficient. + + The method :meth:`_get_doc_link` generates the link to the API documentation for + a given estimator. + + This useful provides all the necessary states for + :func:`sklearn.utils.estimator_html_repr` to generate a link to the API + documentation for the estimator HTML diagram. 
+ + Examples + -------- + If the default values for `_doc_link_module`, `_doc_link_template` are not + suitable, then you can override them and provide a method to generate the URL + parameters: + >>> from sklearn.base import BaseEstimator + >>> doc_link_template = "https://website.com/{single_param}.html" + >>> def url_param_generator(estimator): + ... return {"single_param": estimator.__class__.__name__} + >>> class MyEstimator(BaseEstimator): + ... _doc_link_module = "builtins" + ... _doc_link_template = doc_link_template + ... _doc_link_url_param_generator = url_param_generator + >>> estimator = MyEstimator() + >>> estimator._get_doc_link() + 'https://website.com/MyEstimator.html' + + If instead of overriding the attributes inside the class definition, you want to + override a class instance, you can use `types.MethodType` to bind the method to + the instance: + >>> import types + >>> estimator = BaseEstimator() + >>> estimator._doc_link_template = doc_link_template + >>> estimator._doc_link_url_param_generator = types.MethodType( + ... url_param_generator, estimator) + >>> estimator._get_doc_link() + 'https://website.com/BaseEstimator.html' + """ + + _doc_link_module = "sklearn" + _doc_link_url_param_generator = None + + @property + def _doc_link_template(self): + sklearn_version = parse_version(sklearn.__version__) + if sklearn_version.dev is None: + version_url = f"{sklearn_version.major}.{sklearn_version.minor}" + else: + version_url = "dev" + return getattr( + self, + "__doc_link_template", + ( + f"https://scikit-learn.org/{version_url}/modules/generated/" + "{estimator_module}.{estimator_name}.html" + ), + ) + + @_doc_link_template.setter + def _doc_link_template(self, value): + setattr(self, "__doc_link_template", value) + + def _get_doc_link(self): + """Generates a link to the API documentation for a given estimator. 
+ + This method generates the link to the estimator's documentation page + by using the template defined by the attribute `_doc_link_template`. + + Returns + ------- + url : str + The URL to the API documentation for this estimator. If the estimator + does not belong to module `_doc_link_module`, the empty string (i.e. + `""`) is returned. + """ + if self.__class__.__module__.split(".")[0] != self._doc_link_module: + return "" + + if self._doc_link_url_param_generator is None: + estimator_name = self.__class__.__name__ + # Construct the estimator's module name, up to the first private + # submodule. This works because in scikit-learn all public estimators + # are exposed at that level, even if they actually live in a private + # sub-module. + estimator_module = ".".join( + itertools.takewhile( + lambda part: not part.startswith("_"), + self.__class__.__module__.split("."), + ) + ) + return self._doc_link_template.format( + estimator_module=estimator_module, estimator_name=estimator_name + ) + return self._doc_link_template.format( + **self._doc_link_url_param_generator() + ) + + +doc_link_template = ( + "https://skrub-data.org/{version}/reference/generated/" + "{estimator_module}.{estimator_name}.html" +) +doc_link_module = "skrub" + + +def doc_link_url_param_generator(estimator): + skrub_version = parse_version(__version__) + if skrub_version.dev is None: + version_url = f"{skrub_version.major}.{skrub_version.minor}" + else: + version_url = "dev" + estimator_name = estimator.__class__.__name__ + estimator_module = ".".join( + itertools.takewhile( + lambda part: not part.startswith("_"), + estimator.__class__.__module__.split("."), + ) + ) + return { + "version": version_url, + "estimator_module": estimator_module, + "estimator_name": estimator_name, + } diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index f7dffffce..98df3b414 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -17,6 +17,12 @@ from ._datetime_encoder 
import DatetimeEncoder from ._gap_encoder import GapEncoder from ._on_each_column import SingleColumnTransformer +from ._repr import ( + _HTMLDocumentationLinkMixin, + doc_link_module, + doc_link_template, + doc_link_url_param_generator, +) from ._select_cols import Drop from ._to_datetime import ToDatetime from ._to_float32 import ToFloat32 @@ -110,7 +116,7 @@ def _check_transformer(transformer): return clone(transformer) -class TableVectorizer(TransformerMixin, BaseEstimator): +class TableVectorizer(_HTMLDocumentationLinkMixin, TransformerMixin, BaseEstimator): """Transform a dataframe to a numerical (vectorized) representation. Applies a different transformation to each of several kinds of columns: @@ -405,6 +411,10 @@ class TableVectorizer(TransformerMixin, BaseEstimator): ValueError: Column 'A' used twice in 'specific_transformers', at indices 0 and 1. """ # noqa: E501 + _doc_link_module = doc_link_module + _doc_link_template = doc_link_template + _doc_link_url_param_generator = doc_link_url_param_generator + def __init__( self, *, From c473d92a147680cfcb41d0886bfe28721d20f5ad Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 2 Sep 2024 23:56:43 +0200 Subject: [PATCH 2/6] fix import --- skrub/_repr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_repr.py b/skrub/_repr.py index 4e873f1b8..cf57d7957 100644 --- a/skrub/_repr.py +++ b/skrub/_repr.py @@ -3,8 +3,6 @@ import sklearn from sklearn.utils.fixes import parse_version -from . 
import __version__ - sklearn_version = parse_version(sklearn.__version__) if sklearn_version > parse_version("1.6"): @@ -128,6 +126,8 @@ def _get_doc_link(self): def doc_link_url_param_generator(estimator): + from skrub import __version__ + skrub_version = parse_version(__version__) if skrub_version.dev is None: version_url = f"{skrub_version.major}.{skrub_version.minor}" From ce62f692ff138d99d43544309b2c14d3db16001a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 3 Sep 2024 00:13:39 +0200 Subject: [PATCH 3/6] Add GapEncoder --- skrub/_gap_encoder.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index 0805cfa88..d31db886b 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -19,10 +19,18 @@ from . import _dataframe as sbd from ._on_each_column import RejectColumn, SingleColumnTransformer +from ._repr import ( + _HTMLDocumentationLinkMixin, + doc_link_module, + doc_link_template, + doc_link_url_param_generator, +) from ._utils import unique_strings -class GapEncoder(TransformerMixin, SingleColumnTransformer): +class GapEncoder( + _HTMLDocumentationLinkMixin, TransformerMixin, SingleColumnTransformer +): """Constructs latent topics with continuous encoding. This encoder can be understood as a continuous encoding on a set of latent @@ -177,6 +185,10 @@ class GapEncoder(TransformerMixin, SingleColumnTransformer): The higher the value, the bigger the correspondence with the topic. 
""" + _doc_link_module = doc_link_module + _doc_link_template = doc_link_template + _doc_link_url_param_generator = doc_link_url_param_generator + def __init__( self, n_components=10, From 48878236ead19b44cdc4c5de94abfed9db3ba4c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 3 Sep 2024 08:47:55 +0200 Subject: [PATCH 4/6] remove docstring to pass the test and add TODO for future removal --- skrub/_repr.py | 52 ++++---------------------------------------------- 1 file changed, 4 insertions(+), 48 deletions(-) diff --git a/skrub/_repr.py b/skrub/_repr.py index cf57d7957..02ac76ca0 100644 --- a/skrub/_repr.py +++ b/skrub/_repr.py @@ -5,59 +5,15 @@ sklearn_version = parse_version(sklearn.__version__) +# TODO: remove when scikit-learn 1.6 is the minimum supported version +# TODO: subsequently, we should remove the inheritance from _HTMLDocumentationLinkMixin +# for each estimator then. if sklearn_version > parse_version("1.6"): from sklearn.utils._estimator_html_repr import _HTMLDocumentationLinkMixin else: class _HTMLDocumentationLinkMixin: - """Mixin class allowing to generate a link to the API documentation. - - This mixin relies on three attributes: - - `_doc_link_module`: it corresponds to the root module (e.g. `sklearn`). Using - this mixin, the default value is `sklearn`. - - `_doc_link_template`: it corresponds to the template used to generate the - link to the API documentation. Using this mixin, the default value is - `"https://scikit-learn.org/{version_url}/modules/generated/ - {estimator_module}.{estimator_name}.html"`. - - `_doc_link_url_param_generator`: it corresponds to a function that generates - the parameters to be used in the template when the estimator module and name - are not sufficient. - - The method :meth:`_get_doc_link` generates the link to the API documentation for - a given estimator. 
- - This useful provides all the necessary states for - :func:`sklearn.utils.estimator_html_repr` to generate a link to the API - documentation for the estimator HTML diagram. - - Examples - -------- - If the default values for `_doc_link_module`, `_doc_link_template` are not - suitable, then you can override them and provide a method to generate the URL - parameters: - >>> from sklearn.base import BaseEstimator - >>> doc_link_template = "https://website.com/{single_param}.html" - >>> def url_param_generator(estimator): - ... return {"single_param": estimator.__class__.__name__} - >>> class MyEstimator(BaseEstimator): - ... _doc_link_module = "builtins" - ... _doc_link_template = doc_link_template - ... _doc_link_url_param_generator = url_param_generator - >>> estimator = MyEstimator() - >>> estimator._get_doc_link() - 'https://website.com/MyEstimator.html' - - If instead of overriding the attributes inside the class definition, you want to - override a class instance, you can use `types.MethodType` to bind the method to - the instance: - >>> import types - >>> estimator = BaseEstimator() - >>> estimator._doc_link_template = doc_link_template - >>> estimator._doc_link_url_param_generator = types.MethodType( - ... 
url_param_generator, estimator) - >>> estimator._get_doc_link() - 'https://website.com/BaseEstimator.html' - """ + """Mixin class allowing to generate a link to the API documentation.""" _doc_link_module = "sklearn" _doc_link_url_param_generator = None From b2284b848a82a459dd6da3599e30cd9ab625de04 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 3 Sep 2024 08:52:38 +0200 Subject: [PATCH 5/6] reduce diff using inheritance --- skrub/_gap_encoder.py | 13 ++----------- skrub/_repr.py | 12 +++++++++--- skrub/_table_vectorizer.py | 15 ++++----------- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/skrub/_gap_encoder.py b/skrub/_gap_encoder.py index d31db886b..a36cd3f43 100644 --- a/skrub/_gap_encoder.py +++ b/skrub/_gap_encoder.py @@ -19,17 +19,12 @@ from . import _dataframe as sbd from ._on_each_column import RejectColumn, SingleColumnTransformer -from ._repr import ( - _HTMLDocumentationLinkMixin, - doc_link_module, - doc_link_template, - doc_link_url_param_generator, -) +from ._repr import _SkrubHTMLDocumentationLinkMixin from ._utils import unique_strings class GapEncoder( - _HTMLDocumentationLinkMixin, TransformerMixin, SingleColumnTransformer + _SkrubHTMLDocumentationLinkMixin, TransformerMixin, SingleColumnTransformer ): """Constructs latent topics with continuous encoding. @@ -185,10 +180,6 @@ class GapEncoder( The higher the value, the bigger the correspondence with the topic. """ - _doc_link_module = doc_link_module - _doc_link_template = doc_link_template - _doc_link_url_param_generator = doc_link_url_param_generator - def __init__( self, n_components=10, diff --git a/skrub/_repr.py b/skrub/_repr.py index 02ac76ca0..e320ffa07 100644 --- a/skrub/_repr.py +++ b/skrub/_repr.py @@ -5,9 +5,9 @@ sklearn_version = parse_version(sklearn.__version__) -# TODO: remove when scikit-learn 1.6 is the minimum supported version -# TODO: subsequently, we should remove the inheritance from _HTMLDocumentationLinkMixin -# for each estimator then. 
+# TODO: remove when scikit-learn 1.6 is the minimum supported version and only import +# `_HTMLDocumentationLinkMixin` from scikit-learn. We have this fix due to this bug: +# https://github.com/scikit-learn/scikit-learn/pull/29774 if sklearn_version > parse_version("1.6"): from sklearn.utils._estimator_html_repr import _HTMLDocumentationLinkMixin else: @@ -101,3 +101,9 @@ def doc_link_url_param_generator(estimator): "estimator_module": estimator_module, "estimator_name": estimator_name, } + + +class _SkrubHTMLDocumentationLinkMixin(_HTMLDocumentationLinkMixin): + _doc_link_template = doc_link_template + _doc_link_module = doc_link_module + _doc_link_url_param_generator = doc_link_url_param_generator diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 98df3b414..47d2e8820 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -17,12 +17,7 @@ from ._datetime_encoder import DatetimeEncoder from ._gap_encoder import GapEncoder from ._on_each_column import SingleColumnTransformer -from ._repr import ( - _HTMLDocumentationLinkMixin, - doc_link_module, - doc_link_template, - doc_link_url_param_generator, -) +from ._repr import _SkrubHTMLDocumentationLinkMixin from ._select_cols import Drop from ._to_datetime import ToDatetime from ._to_float32 import ToFloat32 @@ -116,7 +111,9 @@ def _check_transformer(transformer): return clone(transformer) -class TableVectorizer(_HTMLDocumentationLinkMixin, TransformerMixin, BaseEstimator): +class TableVectorizer( + _SkrubHTMLDocumentationLinkMixin, TransformerMixin, BaseEstimator +): """Transform a dataframe to a numerical (vectorized) representation. Applies a different transformation to each of several kinds of columns: @@ -411,10 +408,6 @@ class TableVectorizer(_HTMLDocumentationLinkMixin, TransformerMixin, BaseEstimat ValueError: Column 'A' used twice in 'specific_transformers', at indices 0 and 1. 
""" # noqa: E501 - _doc_link_module = doc_link_module - _doc_link_template = doc_link_template - _doc_link_url_param_generator = doc_link_url_param_generator - def __init__( self, *, From 53093526d1d6191d8cb7cd77b2cfb6b0ab536723 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 3 Sep 2024 10:52:53 +0200 Subject: [PATCH 6/6] [doc build]