diff --git a/doc/_static/arrow.svg b/doc/_static/arrow.svg
new file mode 100644
index 000000000..664f1129c
--- /dev/null
+++ b/doc/_static/arrow.svg
@@ -0,0 +1 @@
+
diff --git a/doc/_static/css/custom.css b/doc/_static/css/custom.css
index 03102daa9..866a6f3e0 100644
--- a/doc/_static/css/custom.css
+++ b/doc/_static/css/custom.css
@@ -27,8 +27,8 @@
}
.card-body {
- padding-left: 32px;
- padding-right: 32px;
+ padding-left: 0px;
+ padding-right: 0px;
}
.card-title {
@@ -171,8 +171,7 @@ div.sk-landing-bg-more-info dd {
div.sk-landing-bg {
background-image: linear-gradient(160deg, rgba(0,48,70,.75) 0%, rgba(0,118,179,.75) 17%, rgba(255,239,193,.75) 59%, rgba(255,149,40,.75) 100%);
- margin-top: -18px !important;
- margin-bottom: 45px;
+ margin-bottom: 0px;
}
div.sk-landing-bg-more-info {
@@ -214,8 +213,8 @@ div.sk-landing-footer {
}
div.card-landing {
- margin-left: 5%;
- margin-right: 5%;
+ margin-left: 0%;
+ margin-right: 0%;
}
div.bd-sidebar-primary {
margin-right: 5%;
@@ -233,17 +232,101 @@ div.sk-landing-footer {
}
/** Scikit-learn buttons ***************************/
+
a.sk-btn-orange {
font-size: 1.1rem;
font-weight: 500;
- background-color: #f99f44; /* sk-orange-tint-1 */
- color: black !important;
+ background-color: #DA7007; /* Darker orange; replaces sk-orange-tint-1 (#f99f44) */
+ color: #ffffff; /* Changed to white for better contrast */
+ border: 2px solid #DA7007; /* Border to match background color, previous: f99f44 */
+ padding: 10px 20px; /* Added padding for better button size */
+ border-radius: 5px; /* Rounded corners for a modern look */
+ text-decoration: none; /* Remove underline */
+ transition: background-color 0.3s ease, color 0.3s ease; /* Smooth transition */
}
a.sk-btn-orange:hover {
- background-color: #fcb575; /* --sk-orange-tint-3 */
+ background-color: #fbe1ca; /* Light orange fill on hover */
+ color: #DA7007; /* Text color changes to match the original background color */
+ border: 2px solid #DA7007; /* Border remains the same on hover */
+}
+
+.container {
+ max-width: 150ch;
+}
+
+.feature-title {
+ color: #DA7007; /* Darker text color for the title */
+ font-weight: 600; /* Bold font for title */
+ margin-bottom: 10px; /* Spacing below title */
+ margin-top: 0px;
+}
+
+.feature-text {
+ font-size: 1rem; /* Regular font size for text */
+ margin-bottom: 0; /* Remove default bottom margin */
}
+
+.hero-description {
+ text-align: left;
+ font-size: 1.25rem;
+ margin-bottom: 10px;
+}
+
+.click-table-report-hint {
+ color: #da4607;
+ font-size: 1.25rem;
+}
+
+
+.thin-line {
+ width: 100%;
+ height: 1px; /* Adjust the thickness of the line */
+ background-color: #12134b; /* Change the color as needed */
+ border: none; /* Ensures no border is applied */
+ margin-top: 20px;
+ margin-bottom: 20px;
+}
+
+@media (min-width: 768px) {
+ .thin-line {
+ margin-top: 50px;
+ margin-bottom: 50px;
+ }
+}
+
+.contributors-list {
+ display: flex;
+ flex-wrap: wrap;
+ gap: 30px; /* Adds space between contributors */
+}
+.contributor {
+ text-align: center;
+}
+.contributor img {
+ border-radius: 50%;
+ width: 80px;
+ height: 80px;
+}
+.contributor-name {
+ margin-top: 10px;
+ font-size: 14px;
+ font-weight: bold;
+}
+
+.home-footer {
+ background-color: #12134b; /* Dark navy background color */
+ color: #fff; /* White text color */
+ text-align: center;
+ padding: 10px 0; /* Padding for the footer */
+ position: relative;
+ width: 100%;
+ bottom: 0; /* Zero offset; with position: relative this does not pin the footer */
+ margin-top: 50px;
+}
+
+
/* Download/laucher links and top hint (sphinx-gallery) */
.sphx-glr-download-link-note,
diff --git a/doc/_static/gap.png b/doc/_static/gap.png
new file mode 100644
index 000000000..380c40390
Binary files /dev/null and b/doc/_static/gap.png differ
diff --git a/doc/_templates/demo_gap_encoder.html b/doc/_templates/demo_gap_encoder.html
new file mode 100644
index 000000000..15469b35e
--- /dev/null
+++ b/doc/_templates/demo_gap_encoder.html
@@ -0,0 +1,9 @@
+
+
+
+
from skrub import GapEncoder
+gap = GapEncoder () . fit ( X [ "employee_position_title" ])
+encoded_labels = gap . transform ( X [ "employee_position_title" ] . head ())
+plt . imshow ( encoded_labels )
+
+
diff --git a/doc/_templates/demo_table_report_code.html b/doc/_templates/demo_table_report_code.html
new file mode 100644
index 000000000..829a4a63a
--- /dev/null
+++ b/doc/_templates/demo_table_report_code.html
@@ -0,0 +1,7 @@
+
+
+
+
from skrub import TableReport
+TableReport (df)
+
+
diff --git a/doc/_templates/demo_tabular_learner.html b/doc/_templates/demo_tabular_learner.html
new file mode 100644
index 000000000..ec6bd15d5
--- /dev/null
+++ b/doc/_templates/demo_tabular_learner.html
@@ -0,0 +1,118 @@
+
+
+
+
+ Given a complex dataframe
+ df
: (expand for full code)
+
+
+
+from skrub.datasets import fetch_employee_salaries
+dataset = fetch_employee_salaries ()
+df = dataset . X
+y = dataset . y
+df
+
+
+
+
+
+
+
+
+
+
+ gender
+ department
+ department_name
+ division
+ assignment_category
+ employee_position_title
+ date_first_hired
+ year_first_hired
+
+
+
+
+ 0
+ F
+ POL
+ Department of Police
+ MSB Information Mgmt and...
+ Fulltime-Regular
+ Office Services Coordinator
+ 09/22/1986
+ 1986
+
+
+ 1
+ M
+ POL
+ Department of Police
+ ISB Major Crimes...
+ Fulltime-Regular
+ Master Police Officer
+ 09/12/1988
+ 1988
+
+
+ ...
+ ...
+ ...
+ ...
+ ...
+ ...
+ ...
+ ...
+ ...
+
+
+ 9226
+ M
+ CCL
+ County Council
+ Council Central Staff
+ Fulltime-Regular
+ Manager II
+ 09/05/2006
+ 2006
+
+
+ 9227
+ M
+ DLC
+ Department of Liquor Control
+ Licensure, Regulation...
+ Fulltime-Regular
+ Alcohol/Tobacco Enforcement Specialist II
+ 01/30/2012
+ 2012
+
+
+
+
+
+
+
+
from sklearn.model_selection import cross_val_score
+from skrub import tabular_learner
+cross_val_score ( tabular_learner ( 'regressor' ), df , y )
+
+
+
array([0.89370447, 0.89279068, 0.92282557, 0.92319094, 0.92162666])
+
+
+
diff --git a/doc/_templates/index.html b/doc/_templates/index.html
index c56be71e5..1394131fe 100644
--- a/doc/_templates/index.html
+++ b/doc/_templates/index.html
@@ -11,230 +11,204 @@
{# We add the full-width banner below the navbar, as the div there is
still full-width (unlike the article) #}
-
-
-
-
-
-
- Built for scikit-learn, Python
- Robust to dirty data
- Easy learning on pandas and polars dataframes
-
-
-
-
+
+
+
+
+
skrub
+ Prepping tables for machine learning
+
+
+
+ Pandas and Polars dataframe inputs and outputs
+ scikit-learn compatible
+ Works on heterogeneous types (numeric, categorical, dates, text, missing values...)
+
+
+
+
{% endblock docs_navbar %}
{% block docs_main %}
-
-
+
+
+
+
skrub is a Python library to ease preprocessing and feature engineering for
+ tabular machine learning.
+
+ Our long-term goal is to directly connect
+ database tables to machine learning estimators.
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
fuzzy_join()
: Joining
-tables on non-normalized categories with approximate matching. Example
-
Joiner
,
-AggJoiner
: transformers for joining multiple tables together. Example
+
+
+
+
+
+
+
+
+
+
+
+
+ {% include "demo_tabular_learner.html" %}
+
-
-
-
-
TableVectorizer
:
-turn a pandas dataframe into a numerical array for
-machine learning. Example
-
GapEncoder
:
-OneHotEncoder but robust to typos or non-normalized categories. Example
+
+
+
+
+
+
+
+
Powerful Feature Engineering
+
Encode text and high cardinality categorical data with the
+ GapEncoder
+ and
+ MinHashEncoder
+ , and extract features from dates with the
+ DatetimeEncoder
+ .
+
+
+ {% include "demo_gap_encoder.html" %}
+
+
+
-
-
-
-
deduplicate()
: merge
-categories of similar morphology (spelling). Example
-
+
+
+
+
+
+
+
+
Interactive Data Exploration
+
Explore your dataframes interactively with
+ TableReport
+ .
+
+ {% include "demo_table_report_code.html" %}
+
+
Click anywhere on the table
+
+
+ {% include "demo_table_report_generated.html" %}
+
+
-
-
-
-
-
-
Less data wrangling, more machine learning
-
-
-
- tabular_learner
:
- easily create tabular-learning pipelines that wrangle complex dataframes.
-
-
-
- Given, a complex dataframe
- df
: (expand for full code)
-
-
-
->>> from skrub.datasets import fetch_employee_salaries
->>> dataset = fetch_employee_salaries ()
->>> df = dataset . X
->>> y = dataset . y
->>> df
-
-
-
-
-
-
-
-
-
-
- gender
- department
- department_name
- division
- assignment_category
- employee_position_title
- date_first_hired
- year_first_hired
-
-
-
-
- 0
- F
- POL
- Department of Police
- MSB Information Mgmt and...
- Fulltime-Regular
- Office Services Coordinator
- 09/22/1986
- 1986
-
-
- 1
- M
- POL
- Department of Police
- ISB Major Crimes...
- Fulltime-Regular
- Master Police Officer
- 09/12/1988
- 1988
-
-
- ...
- ...
- ...
- ...
- ...
- ...
- ...
- ...
- ...
-
-
- 9226
- M
- CCL
- County Council
- Council Central Staff
- Fulltime-Regular
- Manager II
- 09/05/2006
- 2006
-
-
- 9227
- M
- DLC
- Department of Liquor Control
- Licensure, Regulation...
- Fulltime-Regular
- Alcohol/Tobacco Enforcement Specialist II
- 01/30/2012
- 2012
-
-
-
-
-
-
-
-
->>> from sklearn.model_selection import cross_val_score
->>> from skrub import tabular_learner
->>> cross_val_score ( tabular_learner ( 'regressor' ), df , y )
-array([0.89370447, 0.89279068, 0.92282557, 0.92319094, 0.92162666])
+
+
+
+
Our community
+
+ The skrub project is powered by the efforts of a worldwide community of contributors.
+
Here we display a randomly selected group of
+ 30 contributors.
+
See example
+
+
-{# So far, no article footer. Maybe this is brutal #}
-
-
{% endblock docs_main %}
{%- block footer %}
+
+
+
{%- endblock footer %}
diff --git a/doc/conf.py b/doc/conf.py
index 8749522ca..6b7fb208b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -22,6 +22,12 @@
import warnings
from datetime import datetime
+# Generate the table report HTML snippet that the homepage template includes
+sys.path.append(os.path.abspath("."))  # absolute: stays valid if cwd changes
+from table_report import generate_demo
+
+generate_demo()
+
# If extensions (or modules to document with autodoc) are in another
# directory, add these directories to sys.path here. If the directory
# is relative to the documentation root, use os.path.abspath to make it
@@ -178,6 +184,11 @@
"url": "https://github.com/skrub-data/skrub/",
"icon": "fa-brands fa-github",
},
+ {
+ "name": "Discord",
+ "url": "https://discord.gg/ABaPnm7fDC",
+ "icon": "fa-brands fa-discord",
+ },
],
# alternative way to set twitter and github header icons
# "github_url": "https://github.com/pydata/pydata-sphinx-theme",
diff --git a/doc/table_report.py b/doc/table_report.py
new file mode 100644
index 000000000..45f802d14
--- /dev/null
+++ b/doc/table_report.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+
+from skrub import TableReport
+from skrub.datasets import fetch_employee_salaries
+
+
+def generate_demo():
+    """Write the ``TableReport`` HTML snippet included by the docs homepage.
+
+    Fetches the employee salaries dataset, builds a ``TableReport`` from its
+    ``X`` attribute and saves the report's HTML snippet as
+    ``_templates/demo_table_report_generated.html`` beside this module.
+    """
+    X = fetch_employee_salaries().X
+    # Anchor the output on this file instead of the current working directory,
+    # so the snippet lands in the same place wherever the build is started.
+    templates_dir = Path(__file__).resolve().parent / "_templates"
+    output_path = templates_dir / "demo_table_report_generated.html"
+    # Explicit UTF-8: the platform default encoding may be unable to encode the
+    # report, which would fail the write or garble the included template.
+    output_path.write_text(TableReport(X).html_snippet(), encoding="utf-8")