chore(examples): dataset compare examples (#1167)

* build(deps): bump cycjimmy/semantic-release-action from 2 to 3 (#1154) * chore(actions): disable lint when prs come from dependabot (#1164) * chore(actions): fix push and latest tag configs (#1166) * docs(changelogs): fix changelog format (#1163) * chore: move example files and add new hcc example Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Vasco Ramos <[email protected]>
ydataai · Nov 24, 2022 · 47d3aeb · 47d3aeb
1 parent 40a62b8
commit 47d3aeb
Show file tree

Hide file tree

Showing 11 changed files with 234 additions and 137 deletions.
diff --git a/.github/workflows/merge-dev.yml b/.github/workflows/merge-dev.yml
@@ -25,8 +25,8 @@ jobs:
       uses: oprypin/find-latest-tag@v1.1.0
       with:
         repository: ${{ github.repository }}
-        regex: '^\d+\.\d+\.\d+'
-        releases-only: false
+        regex: '^v\d+\.\d+\.\d+'
+        releases-only: true
 
     - name: Extract semantic version
       id: semantic

diff --git a/.github/workflows/merge-master.yml b/.github/workflows/merge-master.yml
@@ -24,7 +24,7 @@ jobs:
       with:
         token: ${{ secrets.ACCESS_TOKEN }}
 
-    - uses: cycjimmy/semantic-release-action@v2
+    - uses: cycjimmy/semantic-release-action@v3
       id: semantic
       env:
         GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }}
@@ -50,6 +50,7 @@ jobs:
     - uses: actions/checkout@v3
       with:
         fetch-depth: 0
+        token: ${{ secrets.ACCESS_TOKEN }}
 
     - name: Calculate changelog path
       id: path
@@ -71,11 +72,11 @@ jobs:
     - name: Add changelog to docs
       run: |
         cd docsrc/source/pages/reference
-        sed -i -e 's/# \[.*/# Changelog ${{ needs.prepare.outputs.release }}/g' \
-            -e 's/## Bug Fixes/## 🐛 Bug fixes/g' \
-            -e 's/## Features/## 🎉 Features/g' \
-            -e 's/## BREAKING CHANGES/## 🚨 Breaking changes/g' \
-            -e 's/## Documentation/## 📖 Documentation/g' \
+        sed -i -e 's/## \[.*/### Changelog ${{ needs.prepare.outputs.release }}/g' \
+            -e 's/### Bug Fixes/#### 🐛 Bug fixes/g' \
+            -e 's/### Features/#### 🎉 Features/g' \
+            -e 's/### BREAKING CHANGES/#### 🚨 Breaking changes/g' \
+            -e 's/### Documentation/#### 📖 Documentation/g' \
             changelog/${{ steps.path.outputs.value }}
 
         grep -q ".. include:: changelog/${{ steps.path.outputs.value }}" changelog.rst || sed -i "4 a\\
@@ -101,7 +102,6 @@ jobs:
       with:
         branch: ${{ github.ref }}
         github_token: ${{ secrets.ACCESS_TOKEN }}
-        force: true
 
 
   prerelease-tag:

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
@@ -6,6 +6,7 @@ on:
 
 jobs:
   commitlint:
+    if: github.actor != 'dependabot[bot]'
     name: Lint commit message
     runs-on: ubuntu-latest
 
@@ -17,6 +18,7 @@ jobs:
     - uses: wagoid/commitlint-github-action@v5
 
   lint:
+    if: github.actor != 'dependabot[bot]'
     name: Lint source code
     runs-on: ubuntu-latest
 

diff --git a/docsrc/source/pages/reference/changelog/v3_3_1.rst b/docsrc/source/pages/reference/changelog/v3_3_1.rst
@@ -2,7 +2,7 @@ Changelog v3.3.1
 ----------------
 
 🐛 Bug fixes
-^^^^^^^^^^^
+^^^^^^^^^^^^
 
 -  remove unused imports
    (`66864c1 <https://github.com/ydataai/pandas-profiling/commit/66864c15cfa9b80cb426957e17410c579425d450>`__)

diff --git a/docsrc/source/pages/reference/changelog/v3_4_0.rst b/docsrc/source/pages/reference/changelog/v3_4_0.rst
@@ -1,8 +1,8 @@
 Changelog v3.4.0
-================
+----------------
 
 🐛 Bug fixes
------------
+^^^^^^^^^^^^
 
 -  correlation passing extra parameters
    (`#1114 <https://github.com/ydataai/pandas-profiling/issues/1114>`__)
@@ -29,7 +29,7 @@ Changelog v3.4.0
    (`985fbd1 <https://github.com/ydataai/pandas-profiling/commit/985fbd1fc0e826bda3ac1b725fa8842013743ab3>`__)
 
 🎉 Features
-----------
+^^^^^^^^^^^^
 
 -  add support for Pandas 1.5
    (`#1076 <https://github.com/ydataai/pandas-profiling/issues/1076>`__)

diff --git a/docsrc/source/pages/reference/changelog/v3_5_0.md b/docsrc/source/pages/reference/changelog/v3_5_0.md
@@ -1,7 +1,7 @@
-# Changelog v3.5.0
+### Changelog v3.5.0
 
 
-### 🐛 Bug fixes
+#### 🐛 Bug fixes
 
 * change context managed backend ([#1149](https://github.com/ydataai/pandas-profiling/issues/1149)) ([11e1a8a](https://github.com/ydataai/pandas-profiling/commit/11e1a8a3fa8d13513fe926b731fb907a066af2a1))
 * dataset names on comparison report ([#1159](https://github.com/ydataai/pandas-profiling/issues/1159)) ([3c14d43](https://github.com/ydataai/pandas-profiling/commit/3c14d438d9a557ac85f5663cc3446c0fb3081e18))
@@ -13,8 +13,8 @@
 * update repository links ([#1141](https://github.com/ydataai/pandas-profiling/issues/1141)) ([c742c5d](https://github.com/ydataai/pandas-profiling/commit/c742c5dbeb18fe2907a4c03792e8802993c46da5))
 
 
-### 🎉 Features
+#### 🎉 Features
 
 * add typechecking to profile report ([#1139](https://github.com/ydataai/pandas-profiling/issues/1139)) ([ec8ece0](https://github.com/ydataai/pandas-profiling/commit/ec8ece0de394eb4c2918bb6a74f0c5e5bb77ca61))
 * report comparison example ([#1160](https://github.com/ydataai/pandas-profiling/issues/1160)) ([5e75fd2](https://github.com/ydataai/pandas-profiling/commit/5e75fd275d14c8ce7ba49d0a15ec26810c4c0e73))
-* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150)
+* report comparisons ([#1069](https://github.com/ydataai/pandas-profiling/issues/1069)) ([70ee5c7](https://github.com/ydataai/pandas-profiling/commit/70ee5c776ad0c72d709631690a2df1cde5ca0424)), closes [#1137](https://github.com/ydataai/pandas-profiling/issues/1137) [#1136](https://github.com/ydataai/pandas-profiling/issues/1136) [#1143](https://github.com/ydataai/pandas-profiling/issues/1143) [#1148](https://github.com/ydataai/pandas-profiling/issues/1148) [#1150](https://github.com/ydataai/pandas-profiling/issues/1150)
diff --git a/...ples/features/correlation_auto_example.py → examples/features/correlation_demo.py b/...ples/features/correlation_auto_example.py → examples/features/correlation_demo.py
diff --git a/examples/report_comparison/comparison.py → examples/features/eda_dataset_compare.py b/examples/report_comparison/comparison.py → examples/features/eda_dataset_compare.py
diff --git a/examples/hcc/eda-with-feature-comparison.ipynb b/examples/hcc/eda-with-feature-comparison.ipynb
@@ -0,0 +1,181 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pandas Profiling: HCC Dataset\n",
+    "Source of data: https://www.kaggle.com/datasets/mrsantos/hcc-dataset\n",
+    "\n",
+    "As modifications have been introduced for the purpose of this use case, the .csv file is provided (hcc.csv)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "from pandas_profiling import ProfileReport"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the HCC Dataset\n",
+    "df = pd.read_csv(\"hcc.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Produce and save the profiling report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "original_report = ProfileReport(df, title=\"Original Data\")\n",
+    "original_report.to_file(\"original_report.html\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of \"Alerts\"\n",
+    "Pandas Profiling alerts for the presence of 4 potential data quality problems:\n",
+    "\n",
+    "- `DUPLICATES`: 4 duplicate rows in data\n",
+    "- `CONSTANT`: Constant value “999” in ‘O2’\n",
+    "- `HIGH CORRELATION`: Several features marked as highly correlated\n",
+    "- `MISSING`: Missing Values in ‘Ferritin’\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Removing Duplicate Rows"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop duplicate rows\n",
+    "df_transformed = df.copy()\n",
+    "df_transformed = df_transformed.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Removing Irrelevant Features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove O2\n",
+    "df_transformed = df_transformed.drop(columns=\"O2\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Missing Data Imputation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Impute Missing Values\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "\n",
+    "mean_imputer = SimpleImputer(strategy=\"mean\")\n",
+    "df_transformed[\"Ferritin\"] = mean_imputer.fit_transform(\n",
+    "    df_transformed[\"Ferritin\"].values.reshape(-1, 1)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Produce Comparison Report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transformed_report = ProfileReport(df_transformed, title=\"Transformed Data\")\n",
+    "comparison_report = original_report.compare(transformed_report)\n",
+    "comparison_report.to_file(\"original_vs_transformed.html\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.10.8 ('feat-comp')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "13390b9b50dde76c6c011e02183633aae7d8498993a6e6577a16e1b7cb8c7a8c"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/hcc/eda-with-feature-comparison.py b/examples/hcc/eda-with-feature-comparison.py
@@ -0,0 +1,34 @@
+"""
+    Comparison report example for HCC dataset
+"""
+import pandas as pd
+from sklearn.impute import SimpleImputer
+
+from pandas_profiling import ProfileReport
+
+if __name__ == "__main__":
+
+    # Load the HCC dataset from hcc.csv (relative path: resolved against the current working directory)
+    df = pd.read_csv("hcc.csv")
+
+    # Profile the data as loaded and save the report to original_report.html
+    original_report = ProfileReport(df, title="Original Data")
+    original_report.to_file("original_report.html")
+
+    # Copy first so df is left as loaded, then drop duplicate rows from the copy
+    df_transformed = df.copy()
+    df_transformed = df_transformed.drop_duplicates()
+
+    # Remove the O2 column from the transformed frame
+    df_transformed = df_transformed.drop(columns="O2")
+
+    # Fill missing Ferritin values with the column mean; reshape(-1, 1) passes the column as a single-feature 2-D array
+    mean_imputer = SimpleImputer(strategy="mean")
+    df_transformed["Ferritin"] = mean_imputer.fit_transform(
+        df_transformed["Ferritin"].values.reshape(-1, 1)
+    )
+
+    # Profile the transformed data, compare it against the original report, and save the comparison as HTML
+    transformed_report = ProfileReport(df_transformed, title="Transformed Data")
+    comparison_report = original_report.compare(transformed_report)
+    comparison_report.to_file("original_vs_transformed.html")