Simpler duplicates (#171)

* One find to rule them all * Update docs * Add uniques to report_all * Changes
scrapinghub · Oct 24, 2019 · 476a9dd · 476a9dd
1 parent f46c40e
commit 476a9dd
Show file tree

Hide file tree

Showing 9 changed files with 118 additions and 486 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -19,6 +19,7 @@ Note that the top-most release is changes in the unreleased master branch on Git
 - Added `outcome` property on Result, in order to define a rule outcome based on message levels. #173
 ### Changed
 - Reports rendering. Reports are being generated as HTML with a jinja2 template. `Arche.report_all()` displays the rules results grouped by outcome. The plots are displayed on the "plots" tab. #168
+- `report_all()` accepts `uniques` argument to find duplicates among columns/rows. #171
 
 
 ## [0.3.6] (2019-07-12)

diff --git a/docs/source/nbs/Rules.ipynb b/docs/source/nbs/Rules.ipynb
@@ -1,292 +1 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Rules"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This notebook contains rules used in the library with examples. Some rules executed during `Arche.report_all()`, and some are meant to be executed separately.\n",
-    "\n",
-    "Some definitions here are used interchangeably:\n",
-    "\n",
-    "* Rule - a test case for data. As a test case, it can be failed, passed or skipped. Some of the rules output only information like [Category fields](#Category-fields)\n",
-    "\n",
-    "* **df** - a dataframe which holds input data (from a job, collection or other source)\n",
-    "\n",
-    "* Scrapy cloud item - a row in a **df**\n",
-    "\n",
-    "* Items fields - columns in a **df**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import arche\n",
-    "from arche import *\n",
-    "from arche.readers.items import Items"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_8.csv\"))\n",
-    "target_items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_7.csv\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = items.df.drop(columns=[\"_type\"])\n",
-    "target_df = target_items.df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Accessing Graphs Data\n",
-    "The data is in `stats`. See `Result` class for more details."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "arche.rules.coverage.check_fields_coverage(df).stats"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Coverage\n",
-    "### Fields coverage on input data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "help(arche.rules.coverage.check_fields_coverage)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "arche.rules.coverage.check_fields_coverage(df).show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Anomalies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "help(arche.rules.coverage.anomalies)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "res = arche.rules.coverage.anomalies(target=\"381798/2/4\", sample=[\"381798/2/8\", \"381798/2/7\", \"381798/2/6\"])\n",
-    "res.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Categories"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Category fields"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "help(arche.rules.category.get_categories)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "arche.rules.category.get_categories(df, max_uniques=200).show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Category coverage\n",
-    "In `report_all()`, these rules use `category` tag."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "help(arche.rules.category.get_coverage_per_category)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "arche.rules.category.get_coverage_per_category(df, [\"category\"]).show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "help(arche.rules.category.get_difference)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "arche.rules.category.get_difference(df, target_df, [\"category\"]).show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Compare\n",
-    "### Fields"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "help(arche.rules.compare.fields)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "arche.rules.compare.fields(df, target_df, [\"part_number\", \"name\", \"uom\"]).show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Duplicates\n",
-    "### Find duplicates by columns (fields)\n",
-    "This rule is not included in `Arche.report_all()`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "help(arche.rules.duplicates.find_by)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "arche.rules.duplicates.find_by(df, [\"name\", \"part_number\"]).show(short=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
-  },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "state": {},
-    "version_major": 2,
-    "version_minor": 0
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
+{"cells":[{"cell_type":"markdown","metadata":{},"outputs":[],"source":["# Rules"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["This notebook contains rules used in the library with examples. Some rules executed during `Arche.report_all()`, and some are meant to be executed separately.\n","\n","Some definitions here are used interchangeably:\n","\n","* Rule - a test case for data. As a test case, it can be failed, passed or skipped. Some of the rules output only information like [Category fields](#Category-fields)\n","\n","* **df** - a dataframe which holds input data (from a job, collection or other source)\n","\n","* Scrapy cloud item - a row in a **df**\n","\n","* Items fields - columns in a **df**"]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":"import arche\nfrom arche import *\nfrom arche.readers.items import Items"},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_8.csv\"))\n","target_items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_7.csv\"))"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["df = items.df.drop(columns=[\"_type\"])\n","target_df = target_items.df"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Accessing Graphs Data\n","The data is in `stats`. 
See `Result` class for more details."]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["arche.rules.coverage.check_fields_coverage(df).stats"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Coverage\n","### Fields coverage on input data"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["help(arche.rules.coverage.check_fields_coverage)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["arche.rules.coverage.check_fields_coverage(df).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["### Anomalies"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[],"source":["help(arche.rules.coverage.anomalies)"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["res = arche.rules.coverage.anomalies(target=\"381798/2/4\", sample=[\"381798/2/8\", \"381798/2/7\", \"381798/2/6\"])\n","res.show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Categories"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["### Category fields"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.category.get_categories)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.category.get_categories(df, max_uniques=200).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["### Category coverage\n","In `report_all()`, these rules use `category` tag."]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.category.get_coverage_per_category)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.category.get_coverage_per_category(df, 
[\"category\"]).show()"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.category.get_difference)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.category.get_difference(df, target_df, [\"category\"]).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Compare\n","### Fields"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.compare.fields)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.compare.fields(df, target_df, [\"part_number\", \"name\", \"uom\"]).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":"## Duplicates\n### Find duplicates by any combination of columns (fields)\nThis rule is executed when `uniques` is passed to `Arche.report_all()`."},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["help(arche.rules.duplicates.find_by)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["arche.rules.duplicates.find_by(df, [\"uom\", [\"name\", \"part_number\"]]).show(short=True)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":""}],"nbformat":4,"nbformat_minor":2,"metadata":{"language_info":{"name":"python","codemirror_mode":{"name":"ipython","version":3}},"orig_nbformat":2,"file_extension":".py","mimetype":"text/x-python","name":"python","npconvert_exporter":"python","pygments_lexer":"ipython3","version":3}}
diff --git a/src/arche/arche.py b/src/arche/arche.py
@@ -1,6 +1,6 @@
 from functools import lru_cache
 import logging
-from typing import Iterable, Optional, Union, cast
+from typing import Iterable, List, Optional, Union, cast
 
 from arche.data_quality_report import DataQualityReport
 from arche.readers.items import Items, CollectionItems, JobItems, RawItems
@@ -124,12 +124,21 @@ def get_items(
     def save_result(self, rule_result):
         self.report.save(rule_result)
 
-    def report_all(self, short: bool = False) -> None:
+    def report_all(
+        self, short: bool = False, uniques: List[Union[str, List[str]]] = None
+    ) -> None:
+        """Report on all included rules.
+
+        Args:
+            uniques: see `arche.rules.duplicates.find_by`
+        """
+        if uniques:
+            self.uniques = uniques
         self.run_all_rules()
         IPython.display.clear_output()
         self.report(keys_limit=10 if short else None)
 
-    def run_all_rules(self):
+    def run_all_rules(self) -> None:
         if isinstance(self.source_items, JobItems):
             self.check_metadata(self.source_items.job)
             if self.target_items:
@@ -146,7 +155,6 @@ def data_quality_report(self, bucket: Optional[str] = None):
         IPython.display.clear_output()
         DataQualityReport(self.source_items, self.schema, self.report, bucket)
 
-    @lru_cache(maxsize=32)
     def run_general_rules(self):
         self.save_result(garbage_symbols(self.source_items.df))
         df = self.source_items.df
@@ -156,6 +164,10 @@ def run_general_rules(self):
             )
         )
         self.save_result(category_rules.get_categories(df))
+        if getattr(self, "uniques", None):
+            self.save_result(
+                duplicate_rules.find_by(self.source_items.df, self.uniques)
+            )
 
     def validate_with_json_schema(self) -> None:
         """Run JSON schema check and output results. It will try to find all errors, but
@@ -208,8 +220,7 @@ def run_schema_rules(self) -> None:
 
     def run_customized_rules(self, items, tagged_fields):
         self.save_result(price_rules.compare_was_now(items.df, tagged_fields))
-        self.save_result(duplicate_rules.find_by_unique(items.df, tagged_fields))
-        self.save_result(duplicate_rules.find_by_name_url(items.df, tagged_fields))
+        self.save_result(duplicate_rules.find_by_tags(items.df, tagged_fields))
         self.save_result(
             category_rules.get_coverage_per_category(
                 items.df, tagged_fields.get("category", []) + self.schema.enums